diff --git a/go.mod b/go.mod index f11d88a2f2..a6b325f64d 100644 --- a/go.mod +++ b/go.mod @@ -74,9 +74,9 @@ require ( github.com/charmbracelet/lipgloss v0.9.1 github.com/go-git/go-git/v5 v5.13.1 github.com/gowebpki/jcs v1.0.1 - github.com/klauspost/compress v1.11.4 + github.com/klauspost/compress v1.17.11 github.com/mark3labs/mcp-go v0.33.0 - github.com/mholt/archiver/v3 v3.5.1 + github.com/mholt/archives v0.1.3 github.com/zijiren233/yaml-comment v0.2.1 ) @@ -85,7 +85,8 @@ require ( github.com/ActiveState/pty v0.0.0-20230628221854-6fb90eb08a14 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v1.1.3 // indirect - github.com/andybalholm/brotli v1.0.1 // indirect + github.com/STARRY-S/zip v0.2.1 // indirect + github.com/andybalholm/brotli v1.1.2-0.20250424173009-453214e765f3 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.11 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.17.71 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.33 // indirect @@ -102,29 +103,39 @@ require ( github.com/aws/aws-sdk-go-v2/service/sts v1.34.1 // indirect github.com/aws/smithy-go v1.22.4 // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect + github.com/bodgit/plumbing v1.3.0 // indirect + github.com/bodgit/sevenzip v1.6.0 // indirect + github.com/bodgit/windows v1.0.1 // indirect github.com/cloudflare/circl v1.6.1 // indirect github.com/containerd/console v1.0.4-0.20230313162750-1ae8d489ac81 // indirect github.com/cyphar/filepath-securejoin v0.3.6 // indirect github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect github.com/go-git/go-billy/v5 v5.6.1 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/hinshun/vt10x v0.0.0-20220301184237-5011da428d02 // indirect - github.com/klauspost/pgzip v1.2.5 // indirect + github.com/klauspost/pgzip v1.2.6 // indirect github.com/kr/pty v1.1.8 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/maruel/natural v1.1.0 // indirect github.com/mattn/go-localereader v0.0.1 // indirect + github.com/mikelolasagasti/xz v1.0.1 // indirect + github.com/minio/minlz v1.0.0 // indirect github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/reflow v0.3.0 // indirect github.com/muesli/termenv v0.15.2 // indirect - github.com/pierrec/lz4/v4 v4.1.2 // indirect + github.com/nwaples/rardecode/v2 v2.1.0 // indirect + github.com/pierrec/lz4/v4 v4.1.21 // indirect github.com/pjbgf/sha1cd v0.3.0 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/skeema/knownhosts v1.3.0 // indirect + github.com/sorairolake/lzip-go v0.3.5 // indirect github.com/sosodev/duration v1.3.1 // indirect github.com/yosida95/uritemplate/v3 v3.0.2 // indirect + go4.org 
v0.0.0-20230225012048-214862532bf5 // indirect golang.org/x/sync v0.16.0 // indirect ) @@ -138,7 +149,7 @@ require ( github.com/agnivade/levenshtein v1.1.1 // indirect github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d // indirect github.com/davecgh/go-spew v1.1.1 // indirect - github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect + github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 // indirect github.com/emirpasic/gods v1.18.1 // indirect github.com/gammazero/deque v0.0.0-20200721202602-07291166fe33 // indirect github.com/go-openapi/analysis v0.20.0 // indirect @@ -147,7 +158,6 @@ require ( github.com/go-openapi/loads v0.20.2 // indirect github.com/go-openapi/spec v0.20.3 // indirect github.com/go-stack/stack v1.8.0 // indirect - github.com/golang/snappy v0.0.4 // indirect github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99 // indirect github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect github.com/josharian/intern v1.0.0 // indirect @@ -163,7 +173,6 @@ require ( github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/nu7hatch/gouuid v0.0.0-20131221200532-179d4d0c4d8d // indirect - github.com/nwaples/rardecode v1.1.3 // indirect github.com/oklog/ulid v1.3.1 // indirect github.com/opentracing/opentracing-go v1.2.0 // indirect github.com/pelletier/go-toml v1.7.0 // indirect @@ -179,7 +188,6 @@ require ( github.com/valyala/bytebufferpool v1.0.0 // indirect github.com/valyala/fasttemplate v1.2.2 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect - github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect go.mongodb.org/mongo-driver v1.5.3 // indirect golang.org/x/mod v0.26.0 diff --git a/go.sum b/go.sum index 235c0badab..e0eea27305 100644 --- a/go.sum +++ b/go.sum @@ -5,11 +5,16 @@ cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6A cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= +cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= +cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= +cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/firestore v1.1.0/go.mod h1:ulACoGHTpvq5r8rxGJ4ddJZBZqakUQqClKRT5SZwBmk= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= +cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= +cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= 
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= @@ -45,6 +50,8 @@ github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tN github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/STARRY-S/zip v0.2.1 h1:pWBd4tuSGm3wtpoqRZZ2EAwOmcHK6XFf7bU9qcJXyFg= +github.com/STARRY-S/zip v0.2.1/go.mod h1:xNvshLODWtC4EJ702g7cTYn13G53o1+X9BWnPFpcWV4= github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow= github.com/VividCortex/ewma v1.2.0/go.mod h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAUnGx7j5l4= github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d h1:licZJFw2RwpHMqeKTCYkitsPqHNxTmd4SNR5r94FGM8= @@ -64,8 +71,8 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= -github.com/andybalholm/brotli v1.0.1 h1:KqhlKozYbRtJvsPrrEeXcO+N2l6NYT5A2QAFmSULpEc= -github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y= +github.com/andybalholm/brotli v1.1.2-0.20250424173009-453214e765f3 h1:8PmGpDEZl9yDpcdEr6Odf23feCxK3LNUNMxjXg41pZQ= +github.com/andybalholm/brotli v1.1.2-0.20250424173009-453214e765f3/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= @@ -131,8 +138,15 @@ github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdn github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= github.com/bmatcuk/doublestar/v4 v4.7.1 h1:fdDeAqgT47acgwd9bd9HxJRDmc9UAmPpc+2m0CXv75Q= github.com/bmatcuk/doublestar/v4 v4.7.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc= +github.com/bodgit/plumbing v1.3.0 h1:pf9Itz1JOQgn7vEOE7v7nlEfBykYqvUYioC61TwWCFU= +github.com/bodgit/plumbing v1.3.0/go.mod h1:JOTb4XiRu5xfnmdnDJo6GmSbSbtSyufrsyZFByMtKEs= +github.com/bodgit/sevenzip v1.6.0 h1:a4R0Wu6/P1o1pP/3VV++aEOcyeBxeO/xE2Y9NSTrr6A= +github.com/bodgit/sevenzip v1.6.0/go.mod h1:zOBh9nJUof7tcrlqJFv1koWRrhz3LbDbUNngkuZxLMc= +github.com/bodgit/windows v1.0.1 h1:tF7K6KOluPYygXa3Z2594zxlkbKPAOvqr97etrGNIz4= +github.com/bodgit/windows v1.0.1/go.mod h1:a6JLwrB4KrTR5hBpp8FI9/9W9jJfeQ2h4XDXU74ZCdM= github.com/brunoga/deep v1.2.4 h1:Aj9E9oUbE+ccbyh35VC/NHlzzjfIVU69BXu2mt2LmL8= github.com/brunoga/deep v1.2.4/go.mod h1:GDV6dnXqn80ezsLSZ5Wlv1PdKAWAO4L5PnKYtv2dgaI= 
+github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/charmbracelet/bubbles v0.18.0 h1:PYv1A036luoBGroX6VWjQIE9Syf2Wby2oOl/39KLfy0= @@ -172,8 +186,8 @@ github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+ github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/docker/go-units v0.3.3/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L7HYpRu/0lE3e0BaElwnNO1qkNQxBY= -github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s= +github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 h1:2tV76y6Q9BB+NEBasnqvs7e49aEBFI8ejC89PSnWH+4= +github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s= github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= @@ -182,6 +196,8 @@ github.com/elazarl/goproxy v1.2.3 h1:xwIyKHbaP5yfT6O9KIeYJR5549MXRQkoQMRXGztz8YQ github.com/elazarl/goproxy v1.2.3/go.mod h1:YfEbZtqP4AetfO6d40vWchF3znWX7C7Vd6ZMfdL8z64= github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= @@ -209,6 +225,7 @@ github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod github.com/go-git/go-git/v5 v5.13.1 h1:DAQ9APonnlvSWpvolXWIuV6Q6zXy2wHbN4cVlNR5Q+M= github.com/go-git/go-git/v5 v5.13.1/go.mod h1:qryJB4cSBoq3FRoBRf5A77joojuBcmPJ0qu3XXXVixc= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= @@ -341,22 +358,26 @@ github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7a 
github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= +github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/golang/snappy v0.0.2/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= -github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= @@ -366,6 +387,7 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof 
v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200615235658-03e1cf38a040/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99 h1:Ak8CrdlwwXwAZxzS66vgPt4U8yUZX7JwLvVR58FN5jM= github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= @@ -388,6 +410,8 @@ github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q= github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= @@ -396,6 +420,8 @@ github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVH github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU= github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk= github.com/hashicorp/go-rootcerts v1.0.0/go.mod h1:K6zTfqpRlCUIjkwsN4Z+hiSfzSTQa6eBIzfwKfwNnHU= @@ -441,6 +467,7 @@ github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8Hm github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/kami-zh/go-capturer v0.0.0-20171211120116-e492ea43421d h1:cVtBfNW5XTHiKQe7jDaDBSh/EVM4XLPutLAGboIXuM0= @@ -456,11 +483,11 @@ github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQL github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= 
github.com/klauspost/compress v1.9.5/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= -github.com/klauspost/compress v1.11.4 h1:kz40R/YWls3iqT9zX9AHN3WoVsrAWVyui5sxuLqiXqU= -github.com/klauspost/compress v1.11.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= +github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= +github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= -github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE= -github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= +github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= +github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= @@ -524,9 +551,13 @@ github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5 github.com/mgutz/ansi v0.0.0-20170206155736-9520e82c474b/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE= github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d h1:5PJl274Y63IEHC+7izoQE9x6ikvDFZS2mDVS3drnohI= github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d/go.mod h1:01TrycV0kFyexm33Z7vhZRXopbI8J3TDReVlkTgMUxE= -github.com/mholt/archiver/v3 v3.5.1 h1:rDjOBX9JSF5BvoJGvjqK479aL70qh9DIpZCl+k7Clwo= -github.com/mholt/archiver/v3 v3.5.1/go.mod h1:e3dqJ7H78uzsRSEACH1joayhuSyhnonssnDhppzS1L4= +github.com/mholt/archives v0.1.3 h1:aEAaOtNra78G+TvV5ohmXrJOAzf++dIlYeDW3N9q458= +github.com/mholt/archives v0.1.3/go.mod h1:LUCGp++/IbV/I0Xq4SzcIR6uwgeh2yjnQWamjRQfLTU= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= +github.com/mikelolasagasti/xz v1.0.1 h1:Q2F2jX0RYJUG3+WsM+FJknv+6eVjsjXNDV0KJXZzkD0= +github.com/mikelolasagasti/xz v1.0.1/go.mod h1:muAirjiOUxPRXwm9HdDtB3uoRPrGnL85XHtokL9Hcgc= +github.com/minio/minlz v1.0.0 h1:Kj7aJZ1//LlTP1DM8Jm7lNKvvJS2m74gyyXXn3+uJWQ= +github.com/minio/minlz v1.0.0/go.mod h1:qT0aEB35q79LLornSzeDH75LBf3aH1MV+jB5w9Wasec= github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= @@ -558,9 +589,8 @@ github.com/nicksnyder/go-i18n v1.10.0/go.mod h1:HrK7VCrbOvQoUAQ7Vpy7i87N7JZZZ7R2 github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nu7hatch/gouuid v0.0.0-20131221200532-179d4d0c4d8d h1:VhgPp6v9qf9Agr/56bj7Y/xa04UccTW04VP0Qed4vnQ= github.com/nu7hatch/gouuid v0.0.0-20131221200532-179d4d0c4d8d/go.mod h1:YUTz3bUH2ZwIWBy3CJBeOBEugqcmXREj14T+iG/4k4U= -github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0= 
-github.com/nwaples/rardecode v1.1.3 h1:cWCaZwfM5H7nAD6PyEdcVnczzV8i/JtotnyW/dD9lEc= -github.com/nwaples/rardecode v1.1.3/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0= +github.com/nwaples/rardecode/v2 v2.1.0 h1:JQl9ZoBPDy+nIZGb1mx8+anfHp/LV3NE2MjMiv0ct/U= +github.com/nwaples/rardecode/v2 v2.1.0/go.mod h1:7uz379lSxPe6j9nvzxUZ+n7mnJNgjsRNb6IbvGVHRmw= github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k= @@ -577,8 +607,8 @@ github.com/pelletier/go-toml v1.7.0 h1:7utD74fnzVc/cpcyy8sjrlFr5vYpypUixARcHIMIG github.com/pelletier/go-toml v1.7.0/go.mod h1:vwGMzjaWMwyfHwgIBhI2YUM4fB6nL6lVAvS1LBMMhTE= github.com/phayes/permbits v0.0.0-20190108233746-1efae4548023 h1:qGtiKAVHKJpZywKrfhpExKGr4PkkRQbgGxoZP4kqVfE= github.com/phayes/permbits v0.0.0-20190108233746-1efae4548023/go.mod h1:3uODdxMgOaPYeWU7RzZLxVtJHZ/x1f/iHkBZuKJDzuY= -github.com/pierrec/lz4/v4 v4.1.2 h1:qvY3YFXRQE/XB8MlLzJH7mSzBs74eA2gg52YTk6jUPM= -github.com/pierrec/lz4/v4 v4.1.2/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= +github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pjbgf/sha1cd v0.3.0 h1:4D5XXmUUBUl/xQ6IjCkEAbqXskkq/4O7LmGn0AqMDs4= github.com/pjbgf/sha1cd v0.3.0/go.mod h1:nZ1rrWOcGJ5uZgEEVL1VUM9iRQiZvWdbZjkKyFzPPsI= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -596,6 +626,7 @@ github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXP github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= @@ -616,6 +647,7 @@ github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99 github.com/rollbar/rollbar-go v1.1.0 h1:3ysiHp3ep8W50ykgBMCKXJGaK2Jdivru7SW9EYfAo+M= github.com/rollbar/rollbar-go v1.1.0/go.mod h1:AcFs5f0I+c71bpHlXNNDbOWJiKwjFDtISeXco0L5PKQ= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= 
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= @@ -640,6 +672,8 @@ github.com/skratchdot/open-golang v0.0.0-20190104022628-a2dfa6d0dab6/go.mod h1:s github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= +github.com/sorairolake/lzip-go v0.3.5 h1:ms5Xri9o1JBIWvOFAorYtUNik6HI3HgBTkISiqu0Cwg= +github.com/sorairolake/lzip-go v0.3.5/go.mod h1:N0KYq5iWrMXI0ZEXKXaS9hCyOjZUQdBDEIbXfoUwbdk= github.com/sosodev/duration v1.3.1 h1:qtHBDMQ6lvMQsL15g4aopM4HEfOaYuhWBw3NPTtlqq4= github.com/sosodev/duration v1.3.1/go.mod h1:RQIBBX0+fMLc/D9+Jb/fwvVmo0eZvDDEERAikUR6SDg= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 h1:qLC7fQah7D6K1B0ujays3HV9gkFtllcxhzImRR7ArPQ= @@ -659,12 +693,17 @@ github.com/spf13/viper v1.7.0/go.mod h1:8WkrPz2fc9jxqZNCJI/76HCieCp4Q8HaLFoCha5q github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.2.1/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= @@ -678,7 +717,6 @@ github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+F github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/ulikunitz/xz v0.5.8/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= -github.com/ulikunitz/xz v0.5.9/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= github.com/ulikunitz/xz v0.5.14 h1:uv/0Bq533iFdnMHZdRBTOlaNMdb1+ZxXIlHDZHIHcvg= github.com/ulikunitz/xz v0.5.14/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= 
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= @@ -697,13 +735,14 @@ github.com/xdg-go/scram v1.0.2/go.mod h1:1WAq6h33pAW+iRreB34OORO2Nf7qel3VV3fjBj+ github.com/xdg-go/stringprep v1.0.2/go.mod h1:8F9zXuvzgwmyT5DUm4GUfZGDdT3W+LCvS6+da4O5kxM= github.com/xdg/scram v0.0.0-20180814205039-7eeb5667e42c/go.mod h1:lB8K/P019DLNhemzwFU4jHLhdvlE6uDZjXFejJXr49I= github.com/xdg/stringprep v0.0.0-20180714160509-73f8eece6fdc/go.mod h1:Jhud4/sHMO4oL310DaZAKk9ZaJ08SJfe+sJh0HrGL1Y= -github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo= -github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= +github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4= github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4= github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= github.com/zijiren233/yaml-comment v0.2.1 h1:/ymMfauuR6zPme+c59FvGNmvxmjOS+BRZSU9YEM82g4= @@ -721,11 +760,15 @@ go.mongodb.org/mongo-driver v1.5.3 h1:wWbFB6zaGHpzguF3f7tW94sVE8sFl3lHx8OZx/4OuF go.mongodb.org/mongo-driver v1.5.3/go.mod h1:gRXCHX4Jo7J0IJ1oDQyUxF7jfy19UfxniMS4xxMmUqw= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= +go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= go.uber.org/goleak v1.2.0/go.mod h1:XJYK+MuIchqpmGmUSAzotztawfKvYLUIgg7guXrwVUo= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= +go4.org v0.0.0-20230225012048-214862532bf5 h1:nifaUDeh+rPaBCMPMQHZmvJf+QdpLFnuQPwx+LxVmtc= +go4.org v0.0.0-20230225012048-214862532bf5/go.mod h1:F57wTi5Lrj6WLyswp5EYV1ncrEbFGHD4hhz6S1ZYeaU= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -739,6 +782,7 @@ golang.org/x/crypto v0.0.0-20190617133340-57b3e21c3d56/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod 
h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200302210943-78000ba7a073/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= @@ -747,6 +791,9 @@ golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= @@ -758,11 +805,16 @@ golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= +golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.26.0 h1:EGMPT//Ezu+ylkCijjPc+f4Aih7sZvaAr+O3EHBxvZg= golang.org/x/mod v0.26.0/go.mod h1:/j6NAhSk8iQ723BGAUyoAcn7SlD7s15Dp9Nd/SfeaFQ= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -782,7 +834,9 @@ golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net 
v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= @@ -791,13 +845,18 @@ golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwY golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210421230115-4e50805a0758/go.mod h1:72T/g9IO56b78aLF+1Kcs5dz7/ng1VjMUvfKvpfy+jM= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -806,6 +865,7 @@ golang.org/x/sync v0.0.0-20190412183630-56d357773e84/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= golang.org/x/sync v0.16.0/go.mod 
h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -829,10 +889,13 @@ golang.org/x/sys v0.0.0-20190531175056-4c3a928424d2/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -843,17 +906,23 @@ golang.org/x/sys v0.0.0-20210420072515-93ed5bcd2bfe/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220721230656-c6bc011c0c49/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= +golang.org/x/text 
v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= @@ -861,6 +930,8 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -882,6 +953,7 @@ golang.org/x/tools v0.0.0-20190416151739-9c9e1878f421/go.mod h1:LCzVGOaR6xXOjkQ3 golang.org/x/tools v0.0.0-20190420181800-aa740d480789/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190531172133-b3315ee88b7d/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190614205625-5aca471b1d59/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= @@ -892,8 +964,17 @@ golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191112195655-aa38f8e97acc/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20201124115921-2c860bdd6e78/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.12/go.mod 
h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -905,10 +986,14 @@ google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= @@ -918,9 +1003,17 @@ google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98 google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= gopkg.in/AlecAivazis/survey.v1 v1.8.8 h1:5UtTowJZTz1j7NxVzDGKTz6Lm9IWm8DDF6b7a2wq9VY= gopkg.in/AlecAivazis/survey.v1 v1.8.8/go.mod h1:CaHjv79TCgAvXMSFJSVgonHXYWxnhzI3eoHtnX5UgUo= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= @@ -959,6 +1052,7 
@@ gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM= howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= @@ -993,3 +1087,5 @@ modernc.org/token v1.0.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= modernc.org/z v1.0.1 h1:WyIDpEpAIx4Hel6q/Pcgj/VhaQV5XPJ2I6ryIYbjnpc= modernc.org/z v1.0.1/go.mod h1:8/SRk5C/HgiQWCgXdfpb+1RvhORdkz5sw72d3jjtyqA= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= +rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index e8ece29276..a7681f8f78 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -1,66 +1,505 @@ +// Package archiver provides archive functionality using the modern archives library package archiver import ( + "archive/tar" + "archive/zip" + "bytes" + "compress/gzip" + "context" + "fmt" + "io" "os" "path/filepath" "strings" - "github.com/ActiveState/cli/internal/errs" "github.com/ActiveState/cli/internal/fileutils" - "github.com/mholt/archiver/v3" + "github.com/mholt/archives" ) -type FileMap struct { - Source string - Target string // Note: Target paths should always be relative to the archive root, do not use absolute paths +// sanitizeArchivePath validates and sanitizes archive entry paths to prevent path traversal attacks +func sanitizeArchivePath(path string) (string, error) { + if path == "" { + return "", fmt.Errorf("empty path not allowed") + } + + for _, r := range path { + if r == 0 || (r < 32 && r != '\t' && r != '\n' && r != '\r') { + return "", fmt.Errorf("path contains invalid characters: %s", path) + } + } + + // Check for raw ".." sequences in the original path (before cleaning) + // This catches cases like "../file.txt" or "dir/../../file.txt" + // But allow "..." as it's a valid filename + if strings.Contains(path, "..") && !strings.Contains(path, "...") { + return "", fmt.Errorf("path contains directory traversal sequence: %s", path) + } + + // Check for Windows absolute paths (C:, D:, etc.) + if len(path) >= 2 && path[1] == ':' && ((path[0] >= 'A' && path[0] <= 'Z') || (path[0] >= 'a' && path[0] <= 'z')) { + return "", fmt.Errorf("absolute path not allowed: %s", path) + } + + // Check for paths starting with backslashes (Windows) + if strings.HasPrefix(path, "\\") { + return "", fmt.Errorf("path cannot start with backslash: %s", path) + } + + // Normalize separators to forward slashes first (cross-platform) + normalizedPath := strings.ReplaceAll(path, "\\", "/") + + cleanPath := filepath.Clean(normalizedPath) + + // Check if the cleaned path contains any remaining ".." 
components + // This is a double-check in case filepath.Clean didn't catch everything + if strings.Contains(cleanPath, "..") && !strings.Contains(cleanPath, "...") { + return "", fmt.Errorf("path contains directory traversal sequence after cleaning: %s", path) + } + + if cleanPath == "" { + return "", fmt.Errorf("empty or invalid path: %s", path) + } + + // Allow root directory "/" as it's a valid entry in TAR archives + if cleanPath == "/" { + return cleanPath, nil + } + + if filepath.IsAbs(cleanPath) { + return "", fmt.Errorf("absolute path not allowed: %s", path) + } + + // Allow "." as it represents the current directory (common in TAR archives) + if cleanPath == "." { + return cleanPath, nil + } + + // Strip leading path separator if present (common in some archive formats) + if strings.HasPrefix(cleanPath, string(filepath.Separator)) { + cleanPath = cleanPath[1:] + } + + // Additional check: ensure the path doesn't start with path separators after stripping + if strings.HasPrefix(cleanPath, string(filepath.Separator)) { + return "", fmt.Errorf("path cannot start with path separator: %s", path) + } + + return cleanPath, nil } -func CreateTgz(archivePath string, workDir string, fileMaps []FileMap) error { - f, err := os.OpenFile(archivePath, os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - return errs.Wrap(err, "Could not create temp file") +// File represents a file in an archive +type File struct { + io.ReadCloser + Header interface{} +} + +// getHeaderName safely extracts the name from archive headers and validates it +func (f File) getHeaderName() (string, error) { + var rawName string + if header, ok := f.Header.(*tar.Header); ok { + rawName = header.Name + } else if header, ok := f.Header.(zip.FileHeader); ok { + rawName = header.Name + } else { + return "", fmt.Errorf("unknown header type") } - defer f.Close() - tgz := archiver.NewTarGz() - if err := tgz.Create(f); err != nil { - return errs.Wrap(err, "Could not create tar.gz") + + // Always sanitize the path to prevent path traversal attacks + return sanitizeArchivePath(rawName) +} + +// getRawHeaderName extracts the raw name from archive headers without validation +// This is used for cases where we need to handle unknown header types gracefully +func (f File) getRawHeaderName() (string, bool) { + if header, ok := f.Header.(*tar.Header); ok { + return header.Name, true + } else if header, ok := f.Header.(zip.FileHeader); ok { + return header.Name, true } - defer tgz.Close() + return "", false +} - for _, fileMap := range fileMaps { - source := fileMap.Source - if !filepath.IsAbs(source) { - // Ensure the source path is absolute, because otherwise it will use the global working directory which - // we're not interested in. 
-		source = filepath.Join(workDir, source)
-	}
-	file, err := os.Open(source)
+
+// Name returns the name of the file
+func (f File) Name() string {
+	// First check if we have a known header type
+	if rawName, ok := f.getRawHeaderName(); ok {
+		// We have a known header type, sanitize the path
+		sanitizedPath, err := sanitizeArchivePath(rawName)
 		if err != nil {
-			return errs.Wrap(err, "Could not open file")
+			// If sanitization fails, return a safe default
+			return "invalid_path"
 		}
+		return filepath.Base(sanitizedPath)
+	}
 
-		fileInfo, err := file.Stat()
-		if err != nil {
-			return errs.Wrap(err, "Could not stat file")
-		}
+	// Unknown header type, return empty string for backward compatibility
+	return ""
+}
 
-		// write it to the archive
-		err = tgz.Write(archiver.File{
-			FileInfo: archiver.FileInfo{
-				FileInfo:   fileInfo,
-				CustomName: fileMap.Target,
-			},
-			ReadCloser: file,
-		})
-		file.Close()
-		if err != nil {
-			return errs.Wrap(err, "Could not write file to tar.gz")
+// FullPath returns the full sanitized path of the file
+func (f File) FullPath() (string, error) {
+	return f.getHeaderName()
+}
+
+// Size returns the size of the file
+func (f File) Size() int64 {
+	if header, ok := f.Header.(*tar.Header); ok {
+		return header.Size
+	}
+	if header, ok := f.Header.(zip.FileHeader); ok {
+		return header.FileInfo().Size()
+	}
+	return 0
+}
+
+// IsDir checks if the file is a directory
+func (f File) IsDir() bool {
+	if header, ok := f.Header.(*tar.Header); ok {
+		return header.FileInfo().IsDir()
+	}
+	if header, ok := f.Header.(zip.FileHeader); ok {
+		return header.FileInfo().IsDir()
+	}
+	return false
+}
+
+// Mode returns the file mode
+func (f File) Mode() os.FileMode {
+	if header, ok := f.Header.(*tar.Header); ok {
+		return header.FileInfo().Mode()
+	}
+	if header, ok := f.Header.(zip.FileHeader); ok {
+		return header.FileInfo().Mode()
+	}
+	return 0
+}
+
+// FileInfo represents file metadata
+type FileInfo struct {
+	os.FileInfo
+	CustomName string
+}
+
+// Name returns the custom name if set, otherwise the original name
+func (fi FileInfo) Name() string {
+	if fi.CustomName != "" {
+		return fi.CustomName
+	}
+	return fi.FileInfo.Name()
+}
+
+// Reader interface for reading archives
+type Reader interface {
+	Open(archiveStream io.Reader, archiveSize int64) error
+	Read() (File, error)
+	Close() error
+}
+
+// Archiver interface for creating archives
+type Archiver interface {
+	Archive(files []string, destination string) error
+}
+
+// Zip implements the Archiver interface for ZIP files
+type Zip struct {
+	OverwriteExisting bool
+	reader            *zip.Reader
+	currentFile       int
+	data              []byte
+}
+
+// NewZip creates a new ZIP archiver
+func NewZip() *Zip {
+	return &Zip{}
+}
+
+// Archive creates a ZIP archive from the given files
+func (z *Zip) Archive(files []string, destination string) error {
+	ctx := context.Background()
+
+	// Create output file
+	outFile, err := os.Create(destination)
+	if err != nil {
+		return err
+	}
+	defer outFile.Close()
+
+	// Map each source path to its base name inside the archive;
+	// archives.FilesFromDisk turns this mapping into a FileInfo slice
+	fileMap := make(map[string]string)
+	for _, file := range files {
+		fileMap[file] = filepath.Base(file)
+	}
+
+	fileInfos, err := archives.FilesFromDisk(ctx, nil, fileMap)
+	if err != nil {
+		return err
+	}
+
+	// Create ZIP archive; avoid naming the variable "zip", which would
+	// shadow the imported archive/zip package
+	zipArchiver := archives.Zip{}
+	return zipArchiver.Archive(ctx, outFile, fileInfos)
+}
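+
+// A minimal usage sketch for the Archiver side (illustrative only; the
+// input paths here are hypothetical):
+//
+//	z := NewZip()
+//	// each input is stored under its base name, e.g. "a.txt"
+//	if err := z.Archive([]string{"/tmp/a.txt", "/tmp/b.txt"}, "/tmp/out.zip"); err != nil {
+//		log.Fatal(err)
+//	}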
fmt.Errorf("file %s does not have .zip extension", archiveName) + } + return nil +} + +// Ext returns the file extension for ZIP files +func (z *Zip) Ext() string { + return ".zip" +} + +// Open opens a ZIP archive for reading +func (z *Zip) Open(archiveStream io.Reader, archiveSize int64) error { + // Read the entire stream into memory since zip.NewReader requires io.ReaderAt + data, err := io.ReadAll(archiveStream) + if err != nil { + return fmt.Errorf("failed to read archive data: %w", err) + } + + // Create a reader from the data + reader, err := zip.NewReader(bytes.NewReader(data), int64(len(data))) + if err != nil { + return fmt.Errorf("failed to create zip reader: %w", err) + } + + z.reader = reader + z.currentFile = 0 + z.data = data + return nil +} + +// Read reads the next file from the ZIP archive +func (z *Zip) Read() (File, error) { + if z.reader == nil { + return File{}, fmt.Errorf("archive not opened") + } + + if z.currentFile >= len(z.reader.File) { + return File{}, io.EOF + } + + // Access file object - path validation happens immediately after + file := z.reader.File[z.currentFile] + + // Validate and sanitize the file path to prevent path traversal attacks + // This validation happens before any file operations to ensure security + _, err := sanitizeArchivePath(file.Name) + if err != nil { + return File{}, fmt.Errorf("invalid file path in archive: %w", err) + } + + z.currentFile++ + + rc, err := file.Open() + if err != nil { + return File{}, fmt.Errorf("failed to open file in zip: %w", err) + } + + return File{ + ReadCloser: rc, + Header: file.FileHeader, + }, nil +} + +// Close closes the ZIP archive +func (z *Zip) Close() error { + z.reader = nil + z.currentFile = 0 + z.data = nil + return nil +} + +// TarGz implements the Archiver interface for tar.gz files +type TarGz struct { + OverwriteExisting bool + Tar *Tar + reader *tar.Reader + gzipReader io.ReadCloser +} + +// NewTarGz creates a new tar.gz archiver +func NewTarGz() *TarGz { + return &TarGz{} +} + +// Archive creates a tar.gz archive from the given files +func (tgz *TarGz) Archive(files []string, destination string) error { + ctx := context.Background() + + // Create output file + outFile, err := os.Create(destination) + if err != nil { + return err + } + defer outFile.Close() + + // Convert file paths to FileInfo slice + fileMap := make(map[string]string) + for _, file := range files { + fileMap[file] = filepath.Base(file) + } + + fileInfos, err := archives.FilesFromDisk(ctx, nil, fileMap) + if err != nil { + return err + } + + // Create tar.gz archive using CompressedArchive + compressedArchive := archives.CompressedArchive{ + Compression: archives.Gz{}, + Archival: archives.Tar{}, + } + return compressedArchive.Archive(ctx, outFile, fileInfos) +} + +// CheckExt checks if the file extension is appropriate for tar.gz +func (tgz *TarGz) CheckExt(archiveName string) error { + if !strings.HasSuffix(strings.ToLower(archiveName), ".tar.gz") { + return fmt.Errorf("file %s does not have .tar.gz extension", archiveName) + } + return nil +} + +// Ext returns the file extension for tar.gz files +func (tgz *TarGz) Ext() string { + return ".tar.gz" +} + +// Open opens a tar.gz archive for reading +func (tgz *TarGz) Open(archiveStream io.Reader, archiveSize int64) error { + // Create gzip reader + gzReader, err := gzip.NewReader(archiveStream) + if err != nil { + return fmt.Errorf("failed to create gzip reader: %w", err) + } + + // Create tar reader + tarReader := tar.NewReader(gzReader) + + tgz.reader = tarReader + 
tgz.gzipReader = gzReader + return nil +} + +// Read reads the next file from the tar.gz archive +func (tgz *TarGz) Read() (File, error) { + if tgz.reader == nil { + return File{}, fmt.Errorf("archive not opened") + } + + header, err := tgz.reader.Next() + if err != nil { + if err == io.EOF { + return File{}, io.EOF } + return File{}, fmt.Errorf("failed to read tar header: %w", err) + } + + // Validate and sanitize the file path to prevent path traversal attacks + _, err = sanitizeArchivePath(header.Name) + if err != nil { + return File{}, fmt.Errorf("invalid file path in archive: %w", err) } + return File{ + ReadCloser: &tarFileReader{ + reader: tgz.reader, + size: header.Size, + }, + Header: header, + }, nil +} + +// Close closes the tar.gz archive +func (tgz *TarGz) Close() error { + if tgz.gzipReader != nil { + tgz.gzipReader.Close() + tgz.gzipReader = nil + } + tgz.reader = nil + return nil +} + +// tarFileReader wraps a tar.Reader to implement io.ReadCloser +type tarFileReader struct { + reader *tar.Reader + size int64 + read int64 +} + +func (tfr *tarFileReader) Read(p []byte) (n int, err error) { + if tfr.read >= tfr.size { + return 0, io.EOF + } + + // Limit read to remaining size + remaining := tfr.size - tfr.read + if int64(len(p)) > remaining { + p = p[:remaining] + } + + n, err = tfr.reader.Read(p) + tfr.read += int64(n) + return n, err +} + +func (tfr *tarFileReader) Close() error { + // For tar files, we don't need to close anything specific return nil } +// Tar represents a tar archive (used within TarGz) +type Tar struct { + StripComponents int +} + +// CreateTgz creates a tar.gz archive with the given file mappings +func CreateTgz(archivePath string, workDir string, fileMaps []FileMap) error { + ctx := context.Background() + + // Create output file + outFile, err := os.Create(archivePath) + if err != nil { + return err + } + defer outFile.Close() + + // Convert fileMaps to FileInfo slice + fileMap := make(map[string]string) + for _, fileMapItem := range fileMaps { + source := fileMapItem.Source + if !filepath.IsAbs(source) { + source = filepath.Join(workDir, source) + } + fileMap[source] = fileMapItem.Target + } + + fileInfos, err := archives.FilesFromDisk(ctx, nil, fileMap) + if err != nil { + return err + } + + // Create tar.gz archive using CompressedArchive + compressedArchive := archives.CompressedArchive{ + Compression: archives.Gz{}, + Archival: archives.Tar{}, + } + return compressedArchive.Archive(ctx, outFile, fileInfos) +} + +// FileMap represents a source to target file mapping +type FileMap struct { + Source string + Target string +} + +// FilesWithCommonParent creates file mappings with a common parent path func FilesWithCommonParent(filepaths ...string) []FileMap { var fileMaps []FileMap common := fileutils.CommonParentPath(filepaths) diff --git a/internal/archiver/archiver_test.go b/internal/archiver/archiver_test.go new file mode 100644 index 0000000000..3dd595154e --- /dev/null +++ b/internal/archiver/archiver_test.go @@ -0,0 +1,475 @@ +package archiver + +import ( + "archive/tar" + "archive/zip" + "bytes" + "compress/gzip" + "io" + "strings" + "testing" +) + +func TestSanitizeArchivePath(t *testing.T) { + tests := []struct { + name string + input string + expectError bool + expected string + }{ + // Valid paths + { + name: "valid simple file", + input: "file.txt", + expectError: false, + expected: "file.txt", + }, + { + name: "valid nested file", + input: "dir/subdir/file.txt", + expectError: false, + expected: "dir/subdir/file.txt", + }, + { + name: 
"valid file with dots in name", + input: "file.backup.txt", + expectError: false, + expected: "file.backup.txt", + }, + { + name: "valid directory", + input: "dir/subdir/", + expectError: false, + expected: "dir/subdir", + }, + + // Path traversal attacks + { + name: "parent directory traversal", + input: "../file.txt", + expectError: true, + }, + { + name: "multiple parent directory traversal", + input: "../../file.txt", + expectError: true, + }, + { + name: "parent traversal in middle", + input: "dir/../file.txt", + expectError: true, + }, + { + name: "parent traversal at end", + input: "dir/..", + expectError: true, + }, + { + name: "parent traversal with file", + input: "dir/../other/file.txt", + expectError: true, + }, + + // Absolute paths + { + name: "absolute path unix", + input: "/etc/passwd", + expectError: true, + }, + { + name: "absolute path windows", + input: "C:\\Windows\\System32", + expectError: true, + }, + + // Paths starting with separators + { + name: "path starting with separator", + input: "/file.txt", + expectError: true, + }, + { + name: "path starting with backslash", + input: "\\file.txt", + expectError: true, + }, + + // Empty and invalid paths + { + name: "empty path", + input: "", + expectError: true, + }, + { + name: "current directory", + input: ".", + expectError: false, + expected: ".", + }, + { + name: "root directory", + input: "/", + expectError: false, + expected: "/", + }, + + // Edge cases + { + name: "path with only dots", + input: "...", + expectError: false, + expected: "...", + }, + { + name: "path with mixed separators", + input: "dir\\subdir/file.txt", + expectError: false, + expected: "dir/subdir/file.txt", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := sanitizeArchivePath(tt.input) + + if tt.expectError { + if err == nil { + t.Errorf("expected error for input %q, but got none", tt.input) + } + return + } + + if err != nil { + t.Errorf("unexpected error for input %q: %v", tt.input, err) + return + } + + if result != tt.expected { + t.Errorf("expected %q, got %q", tt.expected, result) + } + }) + } +} + +func TestFile_Name(t *testing.T) { + tests := []struct { + name string + header interface{} + expected string + }{ + { + name: "valid tar header", + header: &tar.Header{ + Name: "dir/file.txt", + }, + expected: "file.txt", + }, + { + name: "valid zip header", + header: zip.FileHeader{ + Name: "dir/subdir/file.txt", + }, + expected: "file.txt", + }, + { + name: "malicious tar header with path traversal", + header: &tar.Header{ + Name: "../etc/passwd", + }, + expected: "invalid_path", + }, + { + name: "malicious zip header with path traversal", + header: zip.FileHeader{ + Name: "../../../etc/passwd", + }, + expected: "invalid_path", + }, + { + name: "unknown header type", + header: "not a header", + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + file := File{ + Header: tt.header, + } + + result := file.Name() + if result != tt.expected { + t.Errorf("expected %q, got %q", tt.expected, result) + } + }) + } +} + +func TestFile_FullPath(t *testing.T) { + tests := []struct { + name string + header interface{} + expectError bool + expected string + }{ + { + name: "valid tar header", + header: &tar.Header{ + Name: "dir/file.txt", + }, + expectError: false, + expected: "dir/file.txt", + }, + { + name: "valid zip header", + header: zip.FileHeader{ + Name: "dir/subdir/file.txt", + }, + expectError: false, + expected: "dir/subdir/file.txt", + }, + { + name: 
"malicious tar header with path traversal", + header: &tar.Header{ + Name: "../etc/passwd", + }, + expectError: true, + }, + { + name: "malicious zip header with path traversal", + header: zip.FileHeader{ + Name: "../../../etc/passwd", + }, + expectError: true, + }, + { + name: "unknown header type", + header: "not a header", + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + file := File{ + Header: tt.header, + } + + result, err := file.FullPath() + + if tt.expectError { + if err == nil { + t.Errorf("expected error, but got none") + } + return + } + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if result != tt.expected { + t.Errorf("expected %q, got %q", tt.expected, result) + } + }) + } +} + +func TestZip_Read_WithMaliciousEntries(t *testing.T) { + var buf bytes.Buffer + zipWriter := zip.NewWriter(&buf) + + // Add a valid file + validFile, err := zipWriter.Create("valid/file.txt") + if err != nil { + t.Fatalf("failed to create valid file: %v", err) + } + validFile.Write([]byte("valid content")) + + maliciousFile, err := zipWriter.Create("../etc/passwd") + if err != nil { + t.Fatalf("failed to create malicious file: %v", err) + } + maliciousFile.Write([]byte("malicious content")) + + maliciousFile2, err := zipWriter.Create("dir/../../sensitive.txt") + if err != nil { + t.Fatalf("failed to create malicious file 2: %v", err) + } + maliciousFile2.Write([]byte("more malicious content")) + + zipWriter.Close() + + zipReader := NewZip() + err = zipReader.Open(bytes.NewReader(buf.Bytes()), int64(buf.Len())) + if err != nil { + t.Fatalf("failed to open zip: %v", err) + } + defer zipReader.Close() + + file, err := zipReader.Read() + if err != nil { + t.Fatalf("failed to read first file: %v", err) + } + + if file.Name() != "file.txt" { + t.Errorf("expected first file to be 'file.txt', got %q", file.Name()) + } + + _, err = zipReader.Read() + if err == nil { + t.Error("expected error when reading malicious file, but got none") + } + if !strings.Contains(err.Error(), "invalid file path in archive") { + t.Errorf("expected path validation error, got: %v", err) + } + + _, err = zipReader.Read() + if err == nil { + t.Error("expected error when reading second malicious file, but got none") + } + if !strings.Contains(err.Error(), "invalid file path in archive") { + t.Errorf("expected path validation error, got: %v", err) + } +} + +func TestTarGz_Read_WithMaliciousEntries(t *testing.T) { + var buf bytes.Buffer + + var tarBuf bytes.Buffer + tarWriter := tar.NewWriter(&tarBuf) + + validHeader := &tar.Header{ + Name: "valid/file.txt", + Size: 12, + Mode: 0644, + } + tarWriter.WriteHeader(validHeader) + tarWriter.Write([]byte("valid content")) + + maliciousHeader := &tar.Header{ + Name: "../etc/passwd", + Size: 16, + Mode: 0644, + } + tarWriter.WriteHeader(maliciousHeader) + tarWriter.Write([]byte("malicious content")) + + tarWriter.Close() + + gzWriter := gzip.NewWriter(&buf) + gzWriter.Write(tarBuf.Bytes()) + gzWriter.Close() + + tarGzReader := NewTarGz() + err := tarGzReader.Open(bytes.NewReader(buf.Bytes()), int64(buf.Len())) + if err != nil { + t.Fatalf("failed to open tar: %v", err) + } + defer tarGzReader.Close() + + file, err := tarGzReader.Read() + if err != nil { + t.Fatalf("failed to read first file: %v", err) + } + + if file.Name() != "file.txt" { + t.Errorf("expected first file to be 'file.txt', got %q", file.Name()) + } + + _, err = tarGzReader.Read() + if err == nil { + t.Error("expected error when reading malicious 
file, but got none") + } + if !strings.Contains(err.Error(), "invalid file path in archive") { + t.Errorf("expected path validation error, got: %v", err) + } +} + +func TestZip_Read_ValidEntries(t *testing.T) { + var buf bytes.Buffer + zipWriter := zip.NewWriter(&buf) + + files := []string{ + "file1.txt", + "dir/file2.txt", + "dir/subdir/file3.txt", + "file.backup.txt", + } + + for _, filename := range files { + file, err := zipWriter.Create(filename) + if err != nil { + t.Fatalf("failed to create file %s: %v", filename, err) + } + file.Write([]byte("content for " + filename)) + } + + zipWriter.Close() + + zipReader := NewZip() + err := zipReader.Open(bytes.NewReader(buf.Bytes()), int64(buf.Len())) + if err != nil { + t.Fatalf("failed to open zip: %v", err) + } + defer zipReader.Close() + + readFiles := make([]string, 0, len(files)) + for { + file, err := zipReader.Read() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("failed to read file: %v", err) + } + readFiles = append(readFiles, file.Name()) + } + + expectedFiles := []string{"file1.txt", "file2.txt", "file3.txt", "file.backup.txt"} + if len(readFiles) != len(expectedFiles) { + t.Errorf("expected %d files, got %d", len(expectedFiles), len(readFiles)) + } + + for _, expected := range expectedFiles { + found := false + for _, actual := range readFiles { + if actual == expected { + found = true + break + } + } + if !found { + t.Errorf("expected file %s not found in read files", expected) + } + } +} + +func TestSanitizeArchivePath_EdgeCases(t *testing.T) { + edgeCases := []struct { + name string + input string + expectError bool + }{ + {"unicode path", "файл.txt", false}, + {"path with spaces", "file with spaces.txt", false}, + {"path with special chars", "file!@#$%^&*().txt", false}, + {"very long path", strings.Repeat("dir/", 100) + "file.txt", false}, + {"path with null bytes", "file\x00.txt", true}, // null bytes should be rejected + {"path with control chars", "file\x01\x02.txt", true}, // control chars should be rejected + } + + for _, tc := range edgeCases { + t.Run(tc.name, func(t *testing.T) { + _, err := sanitizeArchivePath(tc.input) + if tc.expectError && err == nil { + t.Errorf("expected error for %q, but got none", tc.input) + } + if !tc.expectError && err != nil { + t.Errorf("unexpected error for %q: %v", tc.input, err) + } + }) + } +} diff --git a/internal/unarchiver/targz.go b/internal/unarchiver/targz.go index cd1ca21d23..d74f69b5e1 100644 --- a/internal/unarchiver/targz.go +++ b/internal/unarchiver/targz.go @@ -8,7 +8,7 @@ import ( "os" "path/filepath" - "github.com/mholt/archiver/v3" + "github.com/ActiveState/cli/internal/archiver" ) /* @@ -73,7 +73,14 @@ func (ar *TarGzArchive) ExtractNext(destination string) (f archiver.File, err er if !ok { return f, fmt.Errorf("expected header to be *tar.Header but was %T", f.Header) } - return f, untarSingleFile(header, f, destination, header.Name, ar.OverwriteExisting) + + // Use the sanitized path from the File object instead of raw header.Name + sanitizedPath, err := f.FullPath() + if err != nil { + return f, fmt.Errorf("invalid file path: %w", err) + } + + return f, untarSingleFile(header, f, destination, sanitizedPath, ar.OverwriteExisting) } func untarSingleFile(hdr *tar.Header, data io.Reader, destination, relTo string, overwriteExisting bool) error { diff --git a/internal/unarchiver/unarchiver.go b/internal/unarchiver/unarchiver.go index b90019c1f5..6e6e974ce9 100644 --- a/internal/unarchiver/unarchiver.go +++ 
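For context, these unarchivers are consumed as a pull loop: callers invoke ExtractNext until it returns io.EOF, which the code above deliberately leaves unwrapped. A minimal sketch of such a loop (illustrative only, not code from this change; it presumes an already-opened *TarGzArchive):

```go
// extractAll drains an opened TarGzArchive into destDir.
// Sketch only: how the archive gets opened is elided here.
func extractAll(ar *TarGzArchive, destDir string) error {
	for {
		_, err := ar.ExtractNext(destDir)
		if err == io.EOF {
			return nil // end of archive
		}
		if err != nil {
			// traversal entries surface as "invalid file path" errors
			return err
		}
	}
}
```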
diff --git a/internal/unarchiver/unarchiver.go b/internal/unarchiver/unarchiver.go
index b90019c1f5..6e6e974ce9 100644
--- a/internal/unarchiver/unarchiver.go
+++ b/internal/unarchiver/unarchiver.go
@@ -9,8 +9,8 @@ import (
 	"path/filepath"
 	"runtime"
 
+	"github.com/ActiveState/cli/internal/archiver"
 	"github.com/ActiveState/cli/internal/errs"
-	"github.com/mholt/archiver/v3"
 	"github.com/ActiveState/cli/internal/fileutils"
 )
diff --git a/internal/unarchiver/zip.go b/internal/unarchiver/zip.go
index 114a0bb61a..29403c93ed 100644
--- a/internal/unarchiver/zip.go
+++ b/internal/unarchiver/zip.go
@@ -1,11 +1,11 @@
 package unarchiver
 
 import (
+	"archive/zip"
 	"fmt"
 	"path/filepath"
 
-	"github.com/klauspost/compress/zip"
-	"github.com/mholt/archiver/v3"
+	"github.com/ActiveState/cli/internal/archiver"
 )
 
 /*
@@ -38,11 +38,20 @@ func (z *ZipArchive) ExtractNext(destination string) (f archiver.File, err error
 		return f, err // don't wrap error; calling loop must break on io.EOF
 	}
 	defer f.Close()
-	header, ok := f.Header.(zip.FileHeader)
+
+	// Validate that we have a zip header
+	_, ok := f.Header.(zip.FileHeader)
 	if !ok {
 		return f, fmt.Errorf("expected header to be zip.FileHeader but was %T", f.Header)
 	}
-	return f, z.extractFile(f, filepath.Join(destination, header.Name))
+
+	// Use the sanitized path from the File object instead of raw header.Name
+	sanitizedPath, err := f.FullPath()
+	if err != nil {
+		return f, fmt.Errorf("invalid file path: %w", err)
+	}
+
+	return f, z.extractFile(f, filepath.Join(destination, sanitizedPath))
 }
 
 func (z *ZipArchive) extractFile(f archiver.File, to string) error {
diff --git a/scripts/ci/update-generator/main.go b/scripts/ci/update-generator/main.go
index cd173c49e8..0a67454874 100644
--- a/scripts/ci/update-generator/main.go
+++ b/scripts/ci/update-generator/main.go
@@ -11,7 +11,7 @@ import (
 	"path/filepath"
 	"runtime"
 
-	"github.com/mholt/archiver/v3"
+	"github.com/ActiveState/cli/internal/archiver"
 
 	"github.com/ActiveState/cli/internal/condition"
 	"github.com/ActiveState/cli/internal/constants"
diff --git a/test/integration/checkout_int_test.go b/test/integration/checkout_int_test.go
index 75f033144e..249eab60cb 100644
--- a/test/integration/checkout_int_test.go
+++ b/test/integration/checkout_int_test.go
@@ -9,7 +9,7 @@ import (
 	"strings"
 	"testing"
 
-	"github.com/mholt/archiver/v3"
+	"github.com/ActiveState/cli/internal/archiver"
 
 	"github.com/ActiveState/cli/internal/constants"
 	"github.com/ActiveState/cli/internal/environment"
diff --git a/vendor/github.com/STARRY-S/zip/.gitignore b/vendor/github.com/STARRY-S/zip/.gitignore
new file mode 100644
index 0000000000..3fa4527a66
--- /dev/null
+++ b/vendor/github.com/STARRY-S/zip/.gitignore
@@ -0,0 +1,17 @@
+# Generated files
+*.out
+*.converted
+*.txt
+!NOTICE.txt
+tmp*
+
+# Test archive files
+/*.zip
+!testdata/*.zip
+
+# VSCode config
+/.vscode/
+
+# macOS trash
+.DS_Store
+._.DS_Store
diff --git a/vendor/github.com/STARRY-S/zip/LICENSE b/vendor/github.com/STARRY-S/zip/LICENSE
new file mode 100644
index 0000000000..7bb02ab28a
--- /dev/null
+++ b/vendor/github.com/STARRY-S/zip/LICENSE
@@ -0,0 +1,28 @@
+BSD 3-Clause License
+
+Copyright (c) 2023, Starry
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2.
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/STARRY-S/zip/README.md b/vendor/github.com/STARRY-S/zip/README.md new file mode 100644 index 0000000000..c959850f74 --- /dev/null +++ b/vendor/github.com/STARRY-S/zip/README.md @@ -0,0 +1,41 @@ +Go zip library +============== + +This project is based on the [archive/zip](https://github.com/golang/go/tree/master/src/archive/zip) Go standard library. It adds a new `Updater` struct that allows appending new files to the existing zip archive without having to decompress the whole file, and allows overwriting of files already stored in the zip archive. + +Usage +----- + +```go +import "github.com/STARRY-S/zip" +``` + +```go +// Open an existing test.zip archive with read/write only mode for Updater. +f, err := os.OpenFile("test.zip", os.O_RDWR, 0) +handleErr(err) +zu, err := zip.NewUpdater(f) +handleErr(err) +defer zu.Close() + +// Updater supports modify the zip comment. +err = zu.SetComment("Test update zip archive") +handleErr(err) + +// Append a new file into existing archive. +// The Append method will create a new io.Writer. +w, err := zu.Append("example.txt") +handleErr(err) +// Write data into writer. +_, err = w.Write([]byte("hello world")) +handleErr(err) +``` + +Example test code: [updater_example_test.go](./updater_example_test.go). + +License +------- + +[BSD 3-Clause](LICENSE) + +The zip library is based on [Go standard library](https://github.com/golang/go). diff --git a/vendor/github.com/STARRY-S/zip/reader.go b/vendor/github.com/STARRY-S/zip/reader.go new file mode 100644 index 0000000000..081ba94ca0 --- /dev/null +++ b/vendor/github.com/STARRY-S/zip/reader.go @@ -0,0 +1,978 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package zip + +import ( + "bufio" + "encoding/binary" + "errors" + "hash" + "hash/crc32" + "io" + "io/fs" + "os" + "path" + "path/filepath" + "sort" + "strings" + "sync" + "time" +) + +// var zipinsecurepath = godebug.New("zipinsecurepath") + +var ( + ErrFormat = errors.New("zip: not a valid zip file") + ErrAlgorithm = errors.New("zip: unsupported compression algorithm") + ErrChecksum = errors.New("zip: checksum error") + ErrInsecurePath = errors.New("zip: insecure file path") +) + +// A Reader serves content from a ZIP archive. +type Reader struct { + r io.ReaderAt + File []*File + Comment string + decompressors map[uint16]Decompressor + + // Some JAR files are zip files with a prefix that is a bash script. + // The baseOffset field is the start of the zip file proper. + baseOffset int64 + + // fileList is a list of files sorted by ename, + // for use by the Open method. + fileListOnce sync.Once + fileList []fileListEntry +} + +// A ReadCloser is a [Reader] that must be closed when no longer needed. +type ReadCloser struct { + f *os.File + Reader +} + +// A File is a single file in a ZIP archive. +// The file information is in the embedded [FileHeader]. +// The file content can be accessed by calling [File.Open]. +type File struct { + FileHeader + zip *Reader + zipr io.ReaderAt + headerOffset int64 // includes overall ZIP archive baseOffset + zip64 bool // zip64 extended information extra field presence +} + +// OpenReader will open the Zip file specified by name and return a ReadCloser. +// +// If any file inside the archive uses a non-local name +// (as defined by [filepath.IsLocal]) or a name containing backslashes +// and the GODEBUG environment variable contains `zipinsecurepath=0`, +// OpenReader returns the reader with an ErrInsecurePath error. +// A future version of Go may introduce this behavior by default. +// Programs that want to accept non-local names can ignore +// the ErrInsecurePath error and use the returned reader. +func OpenReader(name string) (*ReadCloser, error) { + f, err := os.Open(name) + if err != nil { + return nil, err + } + fi, err := f.Stat() + if err != nil { + f.Close() + return nil, err + } + r := new(ReadCloser) + if err = r.init(f, fi.Size()); err != nil && err != ErrInsecurePath { + f.Close() + return nil, err + } + r.f = f + return r, err +} + +// NewReader returns a new [Reader] reading from r, which is assumed to +// have the given size in bytes. +// +// If any file inside the archive uses a non-local name +// (as defined by [filepath.IsLocal]) or a name containing backslashes +// and the GODEBUG environment variable contains `zipinsecurepath=0`, +// NewReader returns the reader with an [ErrInsecurePath] error. +// A future version of Go may introduce this behavior by default. +// Programs that want to accept non-local names can ignore +// the [ErrInsecurePath] error and use the returned reader. 
+func NewReader(r io.ReaderAt, size int64) (*Reader, error) { + if size < 0 { + return nil, errors.New("zip: size cannot be negative") + } + zr := new(Reader) + var err error + if err = zr.init(r, size); err != nil && err != ErrInsecurePath { + return nil, err + } + return zr, err +} + +func (r *Reader) init(rdr io.ReaderAt, size int64) error { + end, baseOffset, err := readDirectoryEnd(rdr, size) + if err != nil { + return err + } + r.r = rdr + r.baseOffset = baseOffset + // Since the number of directory records is not validated, it is not + // safe to preallocate r.File without first checking that the specified + // number of files is reasonable, since a malformed archive may + // indicate it contains up to 1 << 128 - 1 files. Since each file has a + // header which will be _at least_ 30 bytes we can safely preallocate + // if (data size / 30) >= end.directoryRecords. + if end.directorySize < uint64(size) && (uint64(size)-end.directorySize)/30 >= end.directoryRecords { + r.File = make([]*File, 0, end.directoryRecords) + } + r.Comment = end.comment + rs := io.NewSectionReader(rdr, 0, size) + if _, err = rs.Seek(r.baseOffset+int64(end.directoryOffset), io.SeekStart); err != nil { + return err + } + buf := bufio.NewReader(rs) + + // The count of files inside a zip is truncated to fit in a uint16. + // Gloss over this by reading headers until we encounter + // a bad one, and then only report an ErrFormat or UnexpectedEOF if + // the file count modulo 65536 is incorrect. + for { + f := &File{zip: r, zipr: rdr} + err = readDirectoryHeader(f, buf) + if err == ErrFormat || err == io.ErrUnexpectedEOF { + break + } + if err != nil { + return err + } + f.headerOffset += r.baseOffset + r.File = append(r.File, f) + } + if uint16(len(r.File)) != uint16(end.directoryRecords) { // only compare 16 bits here + // Return the readDirectoryHeader error if we read + // the wrong number of directory entries. + return err + } + if os.Getenv("GODEBUG") == "zipinsecurepath=0" { + for _, f := range r.File { + if f.Name == "" { + // Zip permits an empty file name field. + continue + } + // The zip specification states that names must use forward slashes, + // so consider any backslashes in the name insecure. + if !filepath.IsLocal(f.Name) || strings.Contains(f.Name, "\\") { + // zipinsecurepath.IncNonDefault() + return ErrInsecurePath + } + } + } + return nil +} + +// RegisterDecompressor registers or overrides a custom decompressor for a +// specific method ID. If a decompressor for a given method is not found, +// [Reader] will default to looking up the decompressor at the package level. +func (r *Reader) RegisterDecompressor(method uint16, dcomp Decompressor) { + if r.decompressors == nil { + r.decompressors = make(map[uint16]Decompressor) + } + r.decompressors[method] = dcomp +} + +func (r *Reader) decompressor(method uint16) Decompressor { + dcomp := r.decompressors[method] + if dcomp == nil { + dcomp = decompressor(method) + } + return dcomp +} + +// Close closes the Zip file, rendering it unusable for I/O. +func (rc *ReadCloser) Close() error { + return rc.f.Close() +} + +// DataOffset returns the offset of the file's possibly-compressed +// data, relative to the beginning of the zip file. +// +// Most callers should instead use [File.Open], which transparently +// decompresses data and verifies checksums. 
+func (f *File) DataOffset() (offset int64, err error) { + bodyOffset, err := f.findBodyOffset() + if err != nil { + return + } + return f.headerOffset + bodyOffset, nil +} + +// Open returns a [ReadCloser] that provides access to the [File]'s contents. +// Multiple files may be read concurrently. +func (f *File) Open() (io.ReadCloser, error) { + bodyOffset, err := f.findBodyOffset() + if err != nil { + return nil, err + } + if strings.HasSuffix(f.Name, "/") { + // The ZIP specification (APPNOTE.TXT) specifies that directories, which + // are technically zero-byte files, must not have any associated file + // data. We previously tried failing here if f.CompressedSize64 != 0, + // but it turns out that a number of implementations (namely, the Java + // jar tool) don't properly set the storage method on directories + // resulting in a file with compressed size > 0 but uncompressed size == + // 0. We still want to fail when a directory has associated uncompressed + // data, but we are tolerant of cases where the uncompressed size is + // zero but compressed size is not. + if f.UncompressedSize64 != 0 { + return &dirReader{ErrFormat}, nil + } else { + return &dirReader{io.EOF}, nil + } + } + size := int64(f.CompressedSize64) + r := io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, size) + dcomp := f.zip.decompressor(f.Method) + if dcomp == nil { + return nil, ErrAlgorithm + } + var rc io.ReadCloser = dcomp(r) + var desr io.Reader + if f.hasDataDescriptor() { + desr = io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset+size, dataDescriptorLen) + } + rc = &checksumReader{ + rc: rc, + hash: crc32.NewIEEE(), + f: f, + desr: desr, + } + return rc, nil +} + +// OpenRaw returns a [Reader] that provides access to the [File]'s contents without +// decompression. +func (f *File) OpenRaw() (io.Reader, error) { + bodyOffset, err := f.findBodyOffset() + if err != nil { + return nil, err + } + r := io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, int64(f.CompressedSize64)) + return r, nil +} + +type dirReader struct { + err error +} + +func (r *dirReader) Read([]byte) (int, error) { + return 0, r.err +} + +func (r *dirReader) Close() error { + return nil +} + +type checksumReader struct { + rc io.ReadCloser + hash hash.Hash32 + nread uint64 // number of bytes read so far + f *File + desr io.Reader // if non-nil, where to read the data descriptor + err error // sticky error +} + +func (r *checksumReader) Stat() (fs.FileInfo, error) { + return headerFileInfo{&r.f.FileHeader}, nil +} + +func (r *checksumReader) Read(b []byte) (n int, err error) { + if r.err != nil { + return 0, r.err + } + n, err = r.rc.Read(b) + r.hash.Write(b[:n]) + r.nread += uint64(n) + if r.nread > r.f.UncompressedSize64 { + return 0, ErrFormat + } + if err == nil { + return + } + if err == io.EOF { + if r.nread != r.f.UncompressedSize64 { + return 0, io.ErrUnexpectedEOF + } + if r.desr != nil { + if err1 := readDataDescriptor(r.desr, r.f); err1 != nil { + if err1 == io.EOF { + err = io.ErrUnexpectedEOF + } else { + err = err1 + } + } else if r.hash.Sum32() != r.f.CRC32 { + err = ErrChecksum + } + } else { + // If there's not a data descriptor, we still compare + // the CRC32 of what we've read against the file header + // or TOC's CRC32, if it seems like it was set. 
+ if r.f.CRC32 != 0 && r.hash.Sum32() != r.f.CRC32 { + err = ErrChecksum + } + } + } + r.err = err + return +} + +func (r *checksumReader) Close() error { return r.rc.Close() } + +// findBodyOffset does the minimum work to verify the file has a header +// and returns the file body offset. +func (f *File) findBodyOffset() (int64, error) { + var buf [fileHeaderLen]byte + if _, err := f.zipr.ReadAt(buf[:], f.headerOffset); err != nil { + return 0, err + } + b := readBuf(buf[:]) + if sig := b.uint32(); sig != fileHeaderSignature { + return 0, ErrFormat + } + b = b[22:] // skip over most of the header + filenameLen := int(b.uint16()) + extraLen := int(b.uint16()) + return int64(fileHeaderLen + filenameLen + extraLen), nil +} + +// readDirectoryHeader attempts to read a directory header from r. +// It returns io.ErrUnexpectedEOF if it cannot read a complete header, +// and ErrFormat if it doesn't find a valid header signature. +func readDirectoryHeader(f *File, r io.Reader) error { + var buf [directoryHeaderLen]byte + if _, err := io.ReadFull(r, buf[:]); err != nil { + return err + } + b := readBuf(buf[:]) + if sig := b.uint32(); sig != directoryHeaderSignature { + return ErrFormat + } + f.CreatorVersion = b.uint16() + f.ReaderVersion = b.uint16() + f.Flags = b.uint16() + f.Method = b.uint16() + f.ModifiedTime = b.uint16() + f.ModifiedDate = b.uint16() + f.CRC32 = b.uint32() + f.CompressedSize = b.uint32() + f.UncompressedSize = b.uint32() + f.CompressedSize64 = uint64(f.CompressedSize) + f.UncompressedSize64 = uint64(f.UncompressedSize) + filenameLen := int(b.uint16()) + extraLen := int(b.uint16()) + commentLen := int(b.uint16()) + b = b[4:] // skipped start disk number and internal attributes (2x uint16) + f.ExternalAttrs = b.uint32() + f.headerOffset = int64(b.uint32()) + d := make([]byte, filenameLen+extraLen+commentLen) + if _, err := io.ReadFull(r, d); err != nil { + return err + } + f.Name = string(d[:filenameLen]) + f.Extra = d[filenameLen : filenameLen+extraLen] + f.Comment = string(d[filenameLen+extraLen:]) + + // Determine the character encoding. + utf8Valid1, utf8Require1 := detectUTF8(f.Name) + utf8Valid2, utf8Require2 := detectUTF8(f.Comment) + switch { + case !utf8Valid1 || !utf8Valid2: + // Name and Comment definitely not UTF-8. + f.NonUTF8 = true + case !utf8Require1 && !utf8Require2: + // Name and Comment use only single-byte runes that overlap with UTF-8. + f.NonUTF8 = false + default: + // Might be UTF-8, might be some other encoding; preserve existing flag. + // Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag. + // Since it is impossible to always distinguish valid UTF-8 from some + // other encoding (e.g., GBK or Shift-JIS), we trust the flag. + f.NonUTF8 = f.Flags&0x800 == 0 + } + + needUSize := f.UncompressedSize == ^uint32(0) + needCSize := f.CompressedSize == ^uint32(0) + needHeaderOffset := f.headerOffset == int64(^uint32(0)) + + // Best effort to find what we need. + // Other zip authors might not even follow the basic format, + // and we'll just ignore the Extra content in that case. + var modified time.Time +parseExtras: + for extra := readBuf(f.Extra); len(extra) >= 4; { // need at least tag and size + fieldTag := extra.uint16() + fieldSize := int(extra.uint16()) + if len(extra) < fieldSize { + break + } + fieldBuf := extra.sub(fieldSize) + + switch fieldTag { + case zip64ExtraID: + f.zip64 = true + + // update directory values from the zip64 extra block. + // They should only be consulted if the sizes read earlier + // are maxed out. 
+ // See golang.org/issue/13367. + if needUSize { + needUSize = false + if len(fieldBuf) < 8 { + return ErrFormat + } + f.UncompressedSize64 = fieldBuf.uint64() + } + if needCSize { + needCSize = false + if len(fieldBuf) < 8 { + return ErrFormat + } + f.CompressedSize64 = fieldBuf.uint64() + } + if needHeaderOffset { + needHeaderOffset = false + if len(fieldBuf) < 8 { + return ErrFormat + } + f.headerOffset = int64(fieldBuf.uint64()) + } + case ntfsExtraID: + if len(fieldBuf) < 4 { + continue parseExtras + } + fieldBuf.uint32() // reserved (ignored) + for len(fieldBuf) >= 4 { // need at least tag and size + attrTag := fieldBuf.uint16() + attrSize := int(fieldBuf.uint16()) + if len(fieldBuf) < attrSize { + continue parseExtras + } + attrBuf := fieldBuf.sub(attrSize) + if attrTag != 1 || attrSize != 24 { + continue // Ignore irrelevant attributes + } + + const ticksPerSecond = 1e7 // Windows timestamp resolution + ts := int64(attrBuf.uint64()) // ModTime since Windows epoch + secs := ts / ticksPerSecond + nsecs := (1e9 / ticksPerSecond) * (ts % ticksPerSecond) + epoch := time.Date(1601, time.January, 1, 0, 0, 0, 0, time.UTC) + modified = time.Unix(epoch.Unix()+secs, nsecs) + } + case unixExtraID, infoZipUnixExtraID: + if len(fieldBuf) < 8 { + continue parseExtras + } + fieldBuf.uint32() // AcTime (ignored) + ts := int64(fieldBuf.uint32()) // ModTime since Unix epoch + modified = time.Unix(ts, 0) + case extTimeExtraID: + if len(fieldBuf) < 5 || fieldBuf.uint8()&1 == 0 { + continue parseExtras + } + ts := int64(fieldBuf.uint32()) // ModTime since Unix epoch + modified = time.Unix(ts, 0) + } + } + + msdosModified := msDosTimeToTime(f.ModifiedDate, f.ModifiedTime) + f.Modified = msdosModified + if !modified.IsZero() { + f.Modified = modified.UTC() + + // If legacy MS-DOS timestamps are set, we can use the delta between + // the legacy and extended versions to estimate timezone offset. + // + // A non-UTC timezone is always used (even if offset is zero). + // Thus, FileHeader.Modified.Location() == time.UTC is useful for + // determining whether extended timestamps are present. + // This is necessary for users that need to do additional time + // calculations when dealing with legacy ZIP formats. + if f.ModifiedTime != 0 || f.ModifiedDate != 0 { + f.Modified = modified.In(timeZone(msdosModified.Sub(modified))) + } + } + + // Assume that uncompressed size 2³²-1 could plausibly happen in + // an old zip32 file that was sharding inputs into the largest chunks + // possible (or is just malicious; search the web for 42.zip). + // If needUSize is true still, it means we didn't see a zip64 extension. + // As long as the compressed size is not also 2³²-1 (implausible) + // and the header is not also 2³²-1 (equally implausible), + // accept the uncompressed size 2³²-1 as valid. + // If nothing else, this keeps archive/zip working with 42.zip. + _ = needUSize + + if needCSize || needHeaderOffset { + return ErrFormat + } + + return nil +} + +func readDataDescriptor(r io.Reader, f *File) error { + var buf [dataDescriptorLen]byte + // The spec says: "Although not originally assigned a + // signature, the value 0x08074b50 has commonly been adopted + // as a signature value for the data descriptor record. + // Implementers should be aware that ZIP files may be + // encountered with or without this signature marking data + // descriptors and should account for either case when reading + // ZIP files to ensure compatibility." 
+ // + // dataDescriptorLen includes the size of the signature but + // first read just those 4 bytes to see if it exists. + if _, err := io.ReadFull(r, buf[:4]); err != nil { + return err + } + off := 0 + maybeSig := readBuf(buf[:4]) + if maybeSig.uint32() != dataDescriptorSignature { + // No data descriptor signature. Keep these four + // bytes. + off += 4 + } + if _, err := io.ReadFull(r, buf[off:12]); err != nil { + return err + } + b := readBuf(buf[:12]) + if b.uint32() != f.CRC32 { + return ErrChecksum + } + + // The two sizes that follow here can be either 32 bits or 64 bits + // but the spec is not very clear on this and different + // interpretations has been made causing incompatibilities. We + // already have the sizes from the central directory so we can + // just ignore these. + + return nil +} + +func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, baseOffset int64, err error) { + // look for directoryEndSignature in the last 1k, then in the last 65k + var buf []byte + var directoryEndOffset int64 + for i, bLen := range []int64{1024, 65 * 1024} { + if bLen > size { + bLen = size + } + buf = make([]byte, int(bLen)) + if _, err := r.ReadAt(buf, size-bLen); err != nil && err != io.EOF { + return nil, 0, err + } + if p := findSignatureInBlock(buf); p >= 0 { + buf = buf[p:] + directoryEndOffset = size - bLen + int64(p) + break + } + if i == 1 || bLen == size { + return nil, 0, ErrFormat + } + } + + // read header into struct + b := readBuf(buf[4:]) // skip signature + d := &directoryEnd{ + diskNbr: uint32(b.uint16()), + dirDiskNbr: uint32(b.uint16()), + dirRecordsThisDisk: uint64(b.uint16()), + directoryRecords: uint64(b.uint16()), + directorySize: uint64(b.uint32()), + directoryOffset: uint64(b.uint32()), + commentLen: b.uint16(), + } + l := int(d.commentLen) + if l > len(b) { + return nil, 0, errors.New("zip: invalid comment length") + } + d.comment = string(b[:l]) + + // These values mean that the file can be a zip64 file + if d.directoryRecords == 0xffff || d.directorySize == 0xffff || d.directoryOffset == 0xffffffff { + p, err := findDirectory64End(r, directoryEndOffset) + if err == nil && p >= 0 { + directoryEndOffset = p + err = readDirectory64End(r, p, d) + } + if err != nil { + return nil, 0, err + } + } + + maxInt64 := uint64(1<<63 - 1) + if d.directorySize > maxInt64 || d.directoryOffset > maxInt64 { + return nil, 0, ErrFormat + } + + baseOffset = directoryEndOffset - int64(d.directorySize) - int64(d.directoryOffset) + + // Make sure directoryOffset points to somewhere in our file. + if o := baseOffset + int64(d.directoryOffset); o < 0 || o >= size { + return nil, 0, ErrFormat + } + + // If the directory end data tells us to use a non-zero baseOffset, + // but we would find a valid directory entry if we assume that the + // baseOffset is 0, then just use a baseOffset of 0. + // We've seen files in which the directory end data gives us + // an incorrect baseOffset. + if baseOffset > 0 { + off := int64(d.directoryOffset) + rs := io.NewSectionReader(r, off, size-off) + if readDirectoryHeader(&File{}, rs) == nil { + baseOffset = 0 + } + } + + return d, baseOffset, nil +} + +// findDirectory64End tries to read the zip64 locator just before the +// directory end and returns the offset of the zip64 directory end if +// found. 
+func findDirectory64End(r io.ReaderAt, directoryEndOffset int64) (int64, error) { + locOffset := directoryEndOffset - directory64LocLen + if locOffset < 0 { + return -1, nil // no need to look for a header outside the file + } + buf := make([]byte, directory64LocLen) + if _, err := r.ReadAt(buf, locOffset); err != nil { + return -1, err + } + b := readBuf(buf) + if sig := b.uint32(); sig != directory64LocSignature { + return -1, nil + } + if b.uint32() != 0 { // number of the disk with the start of the zip64 end of central directory + return -1, nil // the file is not a valid zip64-file + } + p := b.uint64() // relative offset of the zip64 end of central directory record + if b.uint32() != 1 { // total number of disks + return -1, nil // the file is not a valid zip64-file + } + return int64(p), nil +} + +// readDirectory64End reads the zip64 directory end and updates the +// directory end with the zip64 directory end values. +func readDirectory64End(r io.ReaderAt, offset int64, d *directoryEnd) (err error) { + buf := make([]byte, directory64EndLen) + if _, err := r.ReadAt(buf, offset); err != nil { + return err + } + + b := readBuf(buf) + if sig := b.uint32(); sig != directory64EndSignature { + return ErrFormat + } + + b = b[12:] // skip dir size, version and version needed (uint64 + 2x uint16) + d.diskNbr = b.uint32() // number of this disk + d.dirDiskNbr = b.uint32() // number of the disk with the start of the central directory + d.dirRecordsThisDisk = b.uint64() // total number of entries in the central directory on this disk + d.directoryRecords = b.uint64() // total number of entries in the central directory + d.directorySize = b.uint64() // size of the central directory + d.directoryOffset = b.uint64() // offset of start of central directory with respect to the starting disk number + + return nil +} + +func findSignatureInBlock(b []byte) int { + for i := len(b) - directoryEndLen; i >= 0; i-- { + // defined from directoryEndSignature in struct.go + if b[i] == 'P' && b[i+1] == 'K' && b[i+2] == 0x05 && b[i+3] == 0x06 { + // n is length of comment + n := int(b[i+directoryEndLen-2]) | int(b[i+directoryEndLen-1])<<8 + if n+directoryEndLen+i <= len(b) { + return i + } + } + } + return -1 +} + +type readBuf []byte + +func (b *readBuf) uint8() uint8 { + v := (*b)[0] + *b = (*b)[1:] + return v +} + +func (b *readBuf) uint16() uint16 { + v := binary.LittleEndian.Uint16(*b) + *b = (*b)[2:] + return v +} + +func (b *readBuf) uint32() uint32 { + v := binary.LittleEndian.Uint32(*b) + *b = (*b)[4:] + return v +} + +func (b *readBuf) uint64() uint64 { + v := binary.LittleEndian.Uint64(*b) + *b = (*b)[8:] + return v +} + +func (b *readBuf) sub(n int) readBuf { + b2 := (*b)[:n] + *b = (*b)[n:] + return b2 +} + +// A fileListEntry is a File and its ename. +// If file == nil, the fileListEntry describes a directory without metadata. +type fileListEntry struct { + name string + file *File + isDir bool + isDup bool +} + +type fileInfoDirEntry interface { + fs.FileInfo + fs.DirEntry +} + +func (f *fileListEntry) stat() (fileInfoDirEntry, error) { + if f.isDup { + return nil, errors.New(f.name + ": duplicate entries in zip file") + } + if !f.isDir { + return headerFileInfo{&f.file.FileHeader}, nil + } + return f, nil +} + +// Only used for directories. 
+func (f *fileListEntry) Name() string { _, elem, _ := split(f.name); return elem } +func (f *fileListEntry) Size() int64 { return 0 } +func (f *fileListEntry) Mode() fs.FileMode { return fs.ModeDir | 0555 } +func (f *fileListEntry) Type() fs.FileMode { return fs.ModeDir } +func (f *fileListEntry) IsDir() bool { return true } +func (f *fileListEntry) Sys() any { return nil } + +func (f *fileListEntry) ModTime() time.Time { + if f.file == nil { + return time.Time{} + } + return f.file.FileHeader.Modified.UTC() +} + +func (f *fileListEntry) Info() (fs.FileInfo, error) { return f, nil } + +func (f *fileListEntry) String() string { + return fs.FormatDirEntry(f) +} + +// toValidName coerces name to be a valid name for fs.FS.Open. +func toValidName(name string) string { + name = strings.ReplaceAll(name, `\`, `/`) + p := path.Clean(name) + + p = strings.TrimPrefix(p, "/") + + for strings.HasPrefix(p, "../") { + p = p[len("../"):] + } + + return p +} + +func (r *Reader) initFileList() { + r.fileListOnce.Do(func() { + // files and knownDirs map from a file/directory name + // to an index into the r.fileList entry that we are + // building. They are used to mark duplicate entries. + files := make(map[string]int) + knownDirs := make(map[string]int) + + // dirs[name] is true if name is known to be a directory, + // because it appears as a prefix in a path. + dirs := make(map[string]bool) + + for _, file := range r.File { + isDir := len(file.Name) > 0 && file.Name[len(file.Name)-1] == '/' + name := toValidName(file.Name) + if name == "" { + continue + } + + if idx, ok := files[name]; ok { + r.fileList[idx].isDup = true + continue + } + if idx, ok := knownDirs[name]; ok { + r.fileList[idx].isDup = true + continue + } + + for dir := path.Dir(name); dir != "."; dir = path.Dir(dir) { + dirs[dir] = true + } + + idx := len(r.fileList) + entry := fileListEntry{ + name: name, + file: file, + isDir: isDir, + } + r.fileList = append(r.fileList, entry) + if isDir { + knownDirs[name] = idx + } else { + files[name] = idx + } + } + for dir := range dirs { + if _, ok := knownDirs[dir]; !ok { + if idx, ok := files[dir]; ok { + r.fileList[idx].isDup = true + } else { + entry := fileListEntry{ + name: dir, + file: nil, + isDir: true, + } + r.fileList = append(r.fileList, entry) + } + } + } + + sort.Slice(r.fileList, func(i, j int) bool { return fileEntryLess(r.fileList[i].name, r.fileList[j].name) }) + }) +} + +func fileEntryLess(x, y string) bool { + xdir, xelem, _ := split(x) + ydir, yelem, _ := split(y) + return xdir < ydir || xdir == ydir && xelem < yelem +} + +// Open opens the named file in the ZIP archive, +// using the semantics of fs.FS.Open: +// paths are always slash separated, with no +// leading / or ../ elements. 
+func (r *Reader) Open(name string) (fs.File, error) { + r.initFileList() + + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrInvalid} + } + e := r.openLookup(name) + if e == nil { + return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrNotExist} + } + if e.isDir { + return &openDir{e, r.openReadDir(name), 0}, nil + } + rc, err := e.file.Open() + if err != nil { + return nil, err + } + return rc.(fs.File), nil +} + +func split(name string) (dir, elem string, isDir bool) { + if len(name) > 0 && name[len(name)-1] == '/' { + isDir = true + name = name[:len(name)-1] + } + i := len(name) - 1 + for i >= 0 && name[i] != '/' { + i-- + } + if i < 0 { + return ".", name, isDir + } + return name[:i], name[i+1:], isDir +} + +var dotFile = &fileListEntry{name: "./", isDir: true} + +func (r *Reader) openLookup(name string) *fileListEntry { + if name == "." { + return dotFile + } + + dir, elem, _ := split(name) + files := r.fileList + i := sort.Search(len(files), func(i int) bool { + idir, ielem, _ := split(files[i].name) + return idir > dir || idir == dir && ielem >= elem + }) + if i < len(files) { + fname := files[i].name + if fname == name || len(fname) == len(name)+1 && fname[len(name)] == '/' && fname[:len(name)] == name { + return &files[i] + } + } + return nil +} + +func (r *Reader) openReadDir(dir string) []fileListEntry { + files := r.fileList + i := sort.Search(len(files), func(i int) bool { + idir, _, _ := split(files[i].name) + return idir >= dir + }) + j := sort.Search(len(files), func(j int) bool { + jdir, _, _ := split(files[j].name) + return jdir > dir + }) + return files[i:j] +} + +type openDir struct { + e *fileListEntry + files []fileListEntry + offset int +} + +func (d *openDir) Close() error { return nil } +func (d *openDir) Stat() (fs.FileInfo, error) { return d.e.stat() } + +func (d *openDir) Read([]byte) (int, error) { + return 0, &fs.PathError{Op: "read", Path: d.e.name, Err: errors.New("is a directory")} +} + +func (d *openDir) ReadDir(count int) ([]fs.DirEntry, error) { + n := len(d.files) - d.offset + if count > 0 && n > count { + n = count + } + if n == 0 { + if count <= 0 { + return nil, nil + } + return nil, io.EOF + } + list := make([]fs.DirEntry, n) + for i := range list { + s, err := d.files[d.offset+i].stat() + if err != nil { + return nil, err + } + list[i] = s + } + d.offset += n + return list, nil +} diff --git a/vendor/github.com/STARRY-S/zip/register.go b/vendor/github.com/STARRY-S/zip/register.go new file mode 100644 index 0000000000..4389246286 --- /dev/null +++ b/vendor/github.com/STARRY-S/zip/register.go @@ -0,0 +1,147 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package zip + +import ( + "compress/flate" + "errors" + "io" + "sync" +) + +// A Compressor returns a new compressing writer, writing to w. +// The WriteCloser's Close method must be used to flush pending data to w. +// The Compressor itself must be safe to invoke from multiple goroutines +// simultaneously, but each returned writer will be used only by +// one goroutine at a time. +type Compressor func(w io.Writer) (io.WriteCloser, error) + +// A Decompressor returns a new decompressing reader, reading from r. +// The ReadCloser's Close method must be used to release associated resources. 
+// The Decompressor itself must be safe to invoke from multiple goroutines +// simultaneously, but each returned reader will be used only by +// one goroutine at a time. +type Decompressor func(r io.Reader) io.ReadCloser + +var flateWriterPool sync.Pool + +func newFlateWriter(w io.Writer) io.WriteCloser { + fw, ok := flateWriterPool.Get().(*flate.Writer) + if ok { + fw.Reset(w) + } else { + fw, _ = flate.NewWriter(w, 5) + } + return &pooledFlateWriter{fw: fw} +} + +type pooledFlateWriter struct { + mu sync.Mutex // guards Close and Write + fw *flate.Writer +} + +func (w *pooledFlateWriter) Write(p []byte) (n int, err error) { + w.mu.Lock() + defer w.mu.Unlock() + if w.fw == nil { + return 0, errors.New("Write after Close") + } + return w.fw.Write(p) +} + +func (w *pooledFlateWriter) Close() error { + w.mu.Lock() + defer w.mu.Unlock() + var err error + if w.fw != nil { + err = w.fw.Close() + flateWriterPool.Put(w.fw) + w.fw = nil + } + return err +} + +var flateReaderPool sync.Pool + +func newFlateReader(r io.Reader) io.ReadCloser { + fr, ok := flateReaderPool.Get().(io.ReadCloser) + if ok { + fr.(flate.Resetter).Reset(r, nil) + } else { + fr = flate.NewReader(r) + } + return &pooledFlateReader{fr: fr} +} + +type pooledFlateReader struct { + mu sync.Mutex // guards Close and Read + fr io.ReadCloser +} + +func (r *pooledFlateReader) Read(p []byte) (n int, err error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.fr == nil { + return 0, errors.New("Read after Close") + } + return r.fr.Read(p) +} + +func (r *pooledFlateReader) Close() error { + r.mu.Lock() + defer r.mu.Unlock() + var err error + if r.fr != nil { + err = r.fr.Close() + flateReaderPool.Put(r.fr) + r.fr = nil + } + return err +} + +var ( + compressors sync.Map // map[uint16]Compressor + decompressors sync.Map // map[uint16]Decompressor +) + +func init() { + compressors.Store(Store, Compressor(func(w io.Writer) (io.WriteCloser, error) { return &nopCloser{w}, nil })) + compressors.Store(Deflate, Compressor(func(w io.Writer) (io.WriteCloser, error) { return newFlateWriter(w), nil })) + + decompressors.Store(Store, Decompressor(io.NopCloser)) + decompressors.Store(Deflate, Decompressor(newFlateReader)) +} + +// RegisterDecompressor allows custom decompressors for a specified method ID. +// The common methods Store and Deflate are built in. +func RegisterDecompressor(method uint16, dcomp Decompressor) { + if _, dup := decompressors.LoadOrStore(method, dcomp); dup { + panic("decompressor already registered") + } +} + +// RegisterCompressor registers custom compressors for a specified method ID. +// The common methods Store and Deflate are built in. +func RegisterCompressor(method uint16, comp Compressor) { + if _, dup := compressors.LoadOrStore(method, comp); dup { + panic("compressor already registered") + } +} + +func compressor(method uint16) Compressor { + ci, ok := compressors.Load(method) + if !ok { + return nil + } + return ci.(Compressor) +} + +func decompressor(method uint16) Decompressor { + di, ok := decompressors.Load(method) + if !ok { + return nil + } + return di.(Decompressor) +} diff --git a/vendor/github.com/STARRY-S/zip/struct.go b/vendor/github.com/STARRY-S/zip/struct.go new file mode 100644 index 0000000000..9a8e67cc69 --- /dev/null +++ b/vendor/github.com/STARRY-S/zip/struct.go @@ -0,0 +1,419 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
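To illustrate the registration hooks above, here is a hedged sketch that wires a private method ID to compress/flate at its best-compression level. The method ID 0xFF00 and the import alias are assumptions for illustration only; Store and Deflate are pre-registered, so re-registering those would panic:

```go
package main

import (
	"bytes"
	"compress/flate"
	"io"
	"log"

	zip "github.com/STARRY-S/zip"
)

// methodBestFlate is an arbitrary, hypothetical private method ID.
const methodBestFlate uint16 = 0xFF00

func init() {
	zip.RegisterCompressor(methodBestFlate, func(w io.Writer) (io.WriteCloser, error) {
		return flate.NewWriter(w, flate.BestCompression)
	})
	zip.RegisterDecompressor(methodBestFlate, func(r io.Reader) io.ReadCloser {
		return flate.NewReader(r)
	})
}

func main() {
	var buf bytes.Buffer
	w := zip.NewWriter(&buf)
	// Entries created with the custom method use the registered compressor.
	fw, err := w.CreateHeader(&zip.FileHeader{Name: "a.txt", Method: methodBestFlate})
	if err != nil {
		log.Fatal(err)
	}
	if _, err := fw.Write([]byte("hello")); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
}
```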
+ +/* +Package zip provides support for reading and writing ZIP archives. + +See the [ZIP specification] for details. + +This package does not support disk spanning. + +A note about ZIP64: + +To be backwards compatible the FileHeader has both 32 and 64 bit Size +fields. The 64 bit fields will always contain the correct value and +for normal archives both fields will be the same. For files requiring +the ZIP64 format the 32 bit fields will be 0xffffffff and the 64 bit +fields must be used instead. + +[ZIP specification]: https://www.pkware.com/appnote +*/ +package zip + +import ( + "io/fs" + "path" + "time" +) + +// Compression methods. +const ( + Store uint16 = 0 // no compression + Deflate uint16 = 8 // DEFLATE compressed +) + +const ( + fileHeaderSignature = 0x04034b50 + directoryHeaderSignature = 0x02014b50 + directoryEndSignature = 0x06054b50 + directory64LocSignature = 0x07064b50 + directory64EndSignature = 0x06064b50 + dataDescriptorSignature = 0x08074b50 // de-facto standard; required by OS X Finder + fileHeaderLen = 30 // + filename + extra + directoryHeaderLen = 46 // + filename + extra + comment + directoryEndLen = 22 // + comment + dataDescriptorLen = 16 // four uint32: descriptor signature, crc32, compressed size, size + dataDescriptor64Len = 24 // two uint32: signature, crc32 | two uint64: compressed size, size + directory64LocLen = 20 // + directory64EndLen = 56 // + extra + + // Constants for the first byte in CreatorVersion. + creatorFAT = 0 + creatorUnix = 3 + creatorNTFS = 11 + creatorVFAT = 14 + creatorMacOSX = 19 + + // Version numbers. + zipVersion20 = 20 // 2.0 + zipVersion45 = 45 // 4.5 (reads and writes zip64 archives) + + // Limits for non zip64 files. + uint16max = (1 << 16) - 1 + uint32max = (1 << 32) - 1 + + // Extra header IDs. + // + // IDs 0..31 are reserved for official use by PKWARE. + // IDs above that range are defined by third-party vendors. + // Since ZIP lacked high precision timestamps (nor an official specification + // of the timezone used for the date fields), many competing extra fields + // have been invented. Pervasive use effectively makes them "official". + // + // See http://mdfs.net/Docs/Comp/Archiving/Zip/ExtraField + zip64ExtraID = 0x0001 // Zip64 extended information + ntfsExtraID = 0x000a // NTFS + unixExtraID = 0x000d // UNIX + extTimeExtraID = 0x5455 // Extended timestamp + infoZipUnixExtraID = 0x5855 // Info-ZIP Unix extension +) + +// FileHeader describes a file within a ZIP file. +// See the [ZIP specification] for details. +// +// [ZIP specification]: https://www.pkware.com/appnote +type FileHeader struct { + // Name is the name of the file. + // + // It must be a relative path, not start with a drive letter (such as "C:"), + // and must use forward slashes instead of back slashes. A trailing slash + // indicates that this file is a directory and should have no data. + Name string + + // Comment is any arbitrary user-defined string shorter than 64KiB. + Comment string + + // NonUTF8 indicates that Name and Comment are not encoded in UTF-8. + // + // By specification, the only other encoding permitted should be CP-437, + // but historically many ZIP readers interpret Name and Comment as whatever + // the system's local character encoding happens to be. + // + // This flag should only be set if the user intends to encode a non-portable + // ZIP file for a specific localized region. Otherwise, the Writer + // automatically sets the ZIP format's UTF-8 flag for valid UTF-8 strings. 
+ NonUTF8 bool + + CreatorVersion uint16 + ReaderVersion uint16 + Flags uint16 + + // Method is the compression method. If zero, Store is used. + Method uint16 + + // Modified is the modified time of the file. + // + // When reading, an extended timestamp is preferred over the legacy MS-DOS + // date field, and the offset between the times is used as the timezone. + // If only the MS-DOS date is present, the timezone is assumed to be UTC. + // + // When writing, an extended timestamp (which is timezone-agnostic) is + // always emitted. The legacy MS-DOS date field is encoded according to the + // location of the Modified time. + Modified time.Time + + // ModifiedTime is an MS-DOS-encoded time. + // + // Deprecated: Use Modified instead. + ModifiedTime uint16 + + // ModifiedDate is an MS-DOS-encoded date. + // + // Deprecated: Use Modified instead. + ModifiedDate uint16 + + // CRC32 is the CRC32 checksum of the file content. + CRC32 uint32 + + // CompressedSize is the compressed size of the file in bytes. + // If either the uncompressed or compressed size of the file + // does not fit in 32 bits, CompressedSize is set to ^uint32(0). + // + // Deprecated: Use CompressedSize64 instead. + CompressedSize uint32 + + // UncompressedSize is the compressed size of the file in bytes. + // If either the uncompressed or compressed size of the file + // does not fit in 32 bits, CompressedSize is set to ^uint32(0). + // + // Deprecated: Use UncompressedSize64 instead. + UncompressedSize uint32 + + // CompressedSize64 is the compressed size of the file in bytes. + CompressedSize64 uint64 + + // UncompressedSize64 is the uncompressed size of the file in bytes. + UncompressedSize64 uint64 + + Extra []byte + ExternalAttrs uint32 // Meaning depends on CreatorVersion +} + +// FileInfo returns an fs.FileInfo for the FileHeader. +func (h *FileHeader) FileInfo() fs.FileInfo { + return headerFileInfo{h} +} + +// headerFileInfo implements fs.FileInfo. +type headerFileInfo struct { + fh *FileHeader +} + +func (fi headerFileInfo) Name() string { return path.Base(fi.fh.Name) } +func (fi headerFileInfo) Size() int64 { + if fi.fh.UncompressedSize64 > 0 { + return int64(fi.fh.UncompressedSize64) + } + return int64(fi.fh.UncompressedSize) +} +func (fi headerFileInfo) IsDir() bool { return fi.Mode().IsDir() } +func (fi headerFileInfo) ModTime() time.Time { + if fi.fh.Modified.IsZero() { + return fi.fh.ModTime() + } + return fi.fh.Modified.UTC() +} +func (fi headerFileInfo) Mode() fs.FileMode { return fi.fh.Mode() } +func (fi headerFileInfo) Type() fs.FileMode { return fi.fh.Mode().Type() } +func (fi headerFileInfo) Sys() any { return fi.fh } + +func (fi headerFileInfo) Info() (fs.FileInfo, error) { return fi, nil } + +func (fi headerFileInfo) String() string { + return fs.FormatFileInfo(fi) +} + +// FileInfoHeader creates a partially-populated FileHeader from an +// fs.FileInfo. +// Because fs.FileInfo's Name method returns only the base name of +// the file it describes, it may be necessary to modify the Name field +// of the returned header to provide the full path name of the file. +// If compression is desired, callers should set the FileHeader.Method +// field; it is unset by default. 
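A short sketch of the FileInfoHeader workflow just described: stat a file, derive a header, then restore the full path and opt into compression, both of which the caller must do. The path "dir/a.txt" is hypothetical, and the snippet uses the stdlib-compatible API:

```go
package main

import (
	"archive/zip"
	"bytes"
	"log"
	"os"
)

func main() {
	info, err := os.Stat("dir/a.txt")
	if err != nil {
		log.Fatal(err)
	}
	hdr, err := zip.FileInfoHeader(info)
	if err != nil {
		log.Fatal(err)
	}
	hdr.Name = "dir/a.txt"   // FileInfo.Name yields only the base name
	hdr.Method = zip.Deflate // Method is left as Store (0) by default

	var buf bytes.Buffer
	w := zip.NewWriter(&buf)
	fw, err := w.CreateHeader(hdr)
	if err != nil {
		log.Fatal(err)
	}
	data, err := os.ReadFile("dir/a.txt")
	if err != nil {
		log.Fatal(err)
	}
	if _, err := fw.Write(data); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
}
```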
+func FileInfoHeader(fi fs.FileInfo) (*FileHeader, error) { + size := fi.Size() + fh := &FileHeader{ + Name: fi.Name(), + UncompressedSize64: uint64(size), + } + fh.SetModTime(fi.ModTime()) + fh.SetMode(fi.Mode()) + if fh.UncompressedSize64 > uint32max { + fh.UncompressedSize = uint32max + } else { + fh.UncompressedSize = uint32(fh.UncompressedSize64) + } + return fh, nil +} + +type directoryEnd struct { + diskNbr uint32 // unused + dirDiskNbr uint32 // unused + dirRecordsThisDisk uint64 // unused + directoryRecords uint64 + directorySize uint64 + directoryOffset uint64 // relative to file + commentLen uint16 + comment string +} + +// timeZone returns a *time.Location based on the provided offset. +// If the offset is non-sensible, then this uses an offset of zero. +func timeZone(offset time.Duration) *time.Location { + const ( + minOffset = -12 * time.Hour // E.g., Baker island at -12:00 + maxOffset = +14 * time.Hour // E.g., Line island at +14:00 + offsetAlias = 15 * time.Minute // E.g., Nepal at +5:45 + ) + offset = offset.Round(offsetAlias) + if offset < minOffset || maxOffset < offset { + offset = 0 + } + return time.FixedZone("", int(offset/time.Second)) +} + +// msDosTimeToTime converts an MS-DOS date and time into a time.Time. +// The resolution is 2s. +// See: https://msdn.microsoft.com/en-us/library/ms724247(v=VS.85).aspx +func msDosTimeToTime(dosDate, dosTime uint16) time.Time { + return time.Date( + // date bits 0-4: day of month; 5-8: month; 9-15: years since 1980 + int(dosDate>>9+1980), + time.Month(dosDate>>5&0xf), + int(dosDate&0x1f), + + // time bits 0-4: second/2; 5-10: minute; 11-15: hour + int(dosTime>>11), + int(dosTime>>5&0x3f), + int(dosTime&0x1f*2), + 0, // nanoseconds + + time.UTC, + ) +} + +// timeToMsDosTime converts a time.Time to an MS-DOS date and time. +// The resolution is 2s. +// See: https://msdn.microsoft.com/en-us/library/ms724274(v=VS.85).aspx +func timeToMsDosTime(t time.Time) (fDate uint16, fTime uint16) { + fDate = uint16(t.Day() + int(t.Month())<<5 + (t.Year()-1980)<<9) + fTime = uint16(t.Second()/2 + t.Minute()<<5 + t.Hour()<<11) + return +} + +// ModTime returns the modification time in UTC using the legacy +// ModifiedDate and ModifiedTime fields. +// +// Deprecated: Use Modified instead. +func (h *FileHeader) ModTime() time.Time { + return msDosTimeToTime(h.ModifiedDate, h.ModifiedTime) +} + +// SetModTime sets the Modified, ModifiedTime, and ModifiedDate fields +// to the given time in UTC. +// +// Deprecated: Use Modified instead. +func (h *FileHeader) SetModTime(t time.Time) { + t = t.UTC() // Convert to UTC for compatibility + h.Modified = t + h.ModifiedDate, h.ModifiedTime = timeToMsDosTime(t) +} + +const ( + // Unix constants. The specification doesn't mention them, + // but these seem to be the values agreed on by tools. + s_IFMT = 0xf000 + s_IFSOCK = 0xc000 + s_IFLNK = 0xa000 + s_IFREG = 0x8000 + s_IFBLK = 0x6000 + s_IFDIR = 0x4000 + s_IFCHR = 0x2000 + s_IFIFO = 0x1000 + s_ISUID = 0x800 + s_ISGID = 0x400 + s_ISVTX = 0x200 + + msdosDir = 0x10 + msdosReadOnly = 0x01 +) + +// Mode returns the permission and mode bits for the FileHeader. 
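Before moving on to mode bits, a quick sketch of the 2-second MS-DOS resolution described above. Because these helpers are unexported, the sketch assumes it lives in the package's own test file:

```go
package zip

import (
	"testing"
	"time"
)

func TestMsDosTimeRoundTrip(t *testing.T) {
	// 13:04:31 cannot be represented: seconds are stored as second/2,
	// so the round trip comes back as 13:04:30.
	in := time.Date(2024, time.March, 5, 13, 4, 31, 0, time.UTC)
	d, tm := timeToMsDosTime(in)
	got := msDosTimeToTime(d, tm)
	want := time.Date(2024, time.March, 5, 13, 4, 30, 0, time.UTC)
	if !got.Equal(want) {
		t.Fatalf("got %v, want %v", got, want)
	}
}
```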
+func (h *FileHeader) Mode() (mode fs.FileMode) {
+	switch h.CreatorVersion >> 8 {
+	case creatorUnix, creatorMacOSX:
+		mode = unixModeToFileMode(h.ExternalAttrs >> 16)
+	case creatorNTFS, creatorVFAT, creatorFAT:
+		mode = msdosModeToFileMode(h.ExternalAttrs)
+	}
+	if len(h.Name) > 0 && h.Name[len(h.Name)-1] == '/' {
+		mode |= fs.ModeDir
+	}
+	return mode
+}
+
+// SetMode changes the permission and mode bits for the FileHeader.
+func (h *FileHeader) SetMode(mode fs.FileMode) {
+	h.CreatorVersion = h.CreatorVersion&0xff | creatorUnix<<8
+	h.ExternalAttrs = fileModeToUnixMode(mode) << 16
+
+	// set MSDOS attributes too, as the original zip does.
+	if mode&fs.ModeDir != 0 {
+		h.ExternalAttrs |= msdosDir
+	}
+	if mode&0200 == 0 {
+		h.ExternalAttrs |= msdosReadOnly
+	}
+}
+
+// isZip64 reports whether the file size exceeds the 32 bit limit.
+func (h *FileHeader) isZip64() bool {
+	return h.CompressedSize64 >= uint32max || h.UncompressedSize64 >= uint32max
+}
+
+func (h *FileHeader) hasDataDescriptor() bool {
+	return h.Flags&0x8 != 0
+}
+
+func msdosModeToFileMode(m uint32) (mode fs.FileMode) {
+	if m&msdosDir != 0 {
+		mode = fs.ModeDir | 0777
+	} else {
+		mode = 0666
+	}
+	if m&msdosReadOnly != 0 {
+		mode &^= 0222
+	}
+	return mode
+}
+
+func fileModeToUnixMode(mode fs.FileMode) uint32 {
+	var m uint32
+	switch mode & fs.ModeType {
+	default:
+		m = s_IFREG
+	case fs.ModeDir:
+		m = s_IFDIR
+	case fs.ModeSymlink:
+		m = s_IFLNK
+	case fs.ModeNamedPipe:
+		m = s_IFIFO
+	case fs.ModeSocket:
+		m = s_IFSOCK
+	case fs.ModeDevice:
+		m = s_IFBLK
+	case fs.ModeDevice | fs.ModeCharDevice:
+		m = s_IFCHR
+	}
+	if mode&fs.ModeSetuid != 0 {
+		m |= s_ISUID
+	}
+	if mode&fs.ModeSetgid != 0 {
+		m |= s_ISGID
+	}
+	if mode&fs.ModeSticky != 0 {
+		m |= s_ISVTX
+	}
+	return m | uint32(mode&0777)
+}
+
+func unixModeToFileMode(m uint32) fs.FileMode {
+	mode := fs.FileMode(m & 0777)
+	switch m & s_IFMT {
+	case s_IFBLK:
+		mode |= fs.ModeDevice
+	case s_IFCHR:
+		mode |= fs.ModeDevice | fs.ModeCharDevice
+	case s_IFDIR:
+		mode |= fs.ModeDir
+	case s_IFIFO:
+		mode |= fs.ModeNamedPipe
+	case s_IFLNK:
+		mode |= fs.ModeSymlink
+	case s_IFREG:
+		// nothing to do
+	case s_IFSOCK:
+		mode |= fs.ModeSocket
+	}
+	if m&s_ISGID != 0 {
+		mode |= fs.ModeSetgid
+	}
+	if m&s_ISUID != 0 {
+		mode |= fs.ModeSetuid
+	}
+	if m&s_ISVTX != 0 {
+		mode |= fs.ModeSticky
+	}
+	return mode
+}
diff --git a/vendor/github.com/STARRY-S/zip/updater.go b/vendor/github.com/STARRY-S/zip/updater.go
new file mode 100644
index 0000000000..251375c305
--- /dev/null
+++ b/vendor/github.com/STARRY-S/zip/updater.go
@@ -0,0 +1,653 @@
+package zip
+
+import (
+	"errors"
+	"fmt"
+	"hash/crc32"
+	"io"
+	"path/filepath"
+	"slices"
+	"strings"
+)
+
+const bufferSize int64 = 1 << 20 // 1 MiB
+
+// AppendMode specifies how to append a new file to an existing zip archive.
+type AppendMode int
+
+const (
+	// APPEND_MODE_OVERWRITE removes the existing file data and appends the
+	// new data to the end of the zip archive.
+	APPEND_MODE_OVERWRITE AppendMode = iota
+
+	// APPEND_MODE_KEEP_ORIGINAL keeps the original file data and only
+	// writes the new file data at the end of the existing zip archive file.
+	// This mode allows multiple files with the same name to coexist in one
+	// archive file.
+	APPEND_MODE_KEEP_ORIGINAL
+)
+
+// sectionReaderWriter implements the [io.Reader], [io.Writer], [io.Seeker],
+// [io.ReaderAt], and [io.WriterAt] interfaces on top of an [io.ReadWriteSeeker].
+type sectionReaderWriter struct {
+	rws io.ReadWriteSeeker
+}
+
+func newSectionReaderWriter(rws io.ReadWriteSeeker) *sectionReaderWriter {
+	return &sectionReaderWriter{
+		rws: rws,
+	}
+}
+
+func (s *sectionReaderWriter) ReadAt(p []byte, offset int64) (int, error) {
+	currOffset, err := s.rws.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return 0, err
+	}
+	// Restore the original position so ReadAt does not disturb the seek state.
+	defer s.rws.Seek(currOffset, io.SeekStart)
+	_, err = s.rws.Seek(offset, io.SeekStart)
+	if err != nil {
+		return 0, err
+	}
+	return s.rws.Read(p)
+}
+
+func (s *sectionReaderWriter) WriteAt(p []byte, offset int64) (n int, err error) {
+	currOffset, err := s.rws.Seek(0, io.SeekCurrent)
+	if err != nil {
+		return 0, err
+	}
+	// Restore the original position so WriteAt does not disturb the seek state.
+	defer s.rws.Seek(currOffset, io.SeekStart)
+	_, err = s.rws.Seek(offset, io.SeekStart)
+	if err != nil {
+		return 0, err
+	}
+	return s.rws.Write(p)
+}
+
+func (s *sectionReaderWriter) Seek(offset int64, whence int) (int64, error) {
+	return s.rws.Seek(offset, whence)
+}
+
+func (s *sectionReaderWriter) Read(p []byte) (n int, err error) {
+	return s.rws.Read(p)
+}
+
+func (s *sectionReaderWriter) Write(p []byte) (n int, err error) {
+	return s.rws.Write(p)
+}
+
+func (s *sectionReaderWriter) offset() (int64, error) {
+	return s.rws.Seek(0, io.SeekCurrent)
+}
+
+type Directory struct {
+	FileHeader
+	offset int64 // header offset
+}
+
+func (d *Directory) HeaderOffset() int64 {
+	return d.offset
+}
+
+// Updater allows modifying and appending files in an existing zip archive
+// without decompressing the whole file.
+type Updater struct {
+	rw          *sectionReaderWriter
+	offset      int64
+	dir         []*header
+	last        *fileWriter
+	closed      bool
+	compressors map[uint16]Compressor
+	comment     string
+
+	// Some JAR files are zip files with a prefix that is a bash script.
+	// The baseOffset field is the start of the zip file proper.
+	baseOffset int64
+	// dirOffset is the offset at which to write the directory record.
+	// Note that dirOffset may not equal the end offset of the last file's data.
+	dirOffset int64
+}
+
+// NewUpdater returns a new Updater reading from and writing to rws;
+// the archive size is determined by seeking to the end of rws.
+func NewUpdater(rws io.ReadWriteSeeker) (*Updater, error) {
+	size, err := rws.Seek(0, io.SeekEnd)
+	if err != nil {
+		return nil, err
+	}
+	zu := &Updater{
+		rw: newSectionReaderWriter(rws),
+	}
+	if err = zu.init(size); err != nil && err != ErrInsecurePath {
+		return nil, err
+	}
+	return zu, nil
+}
+
+func (u *Updater) init(size int64) error {
+	end, baseOffset, err := readDirectoryEnd(u.rw, size)
+	if err != nil {
+		return err
+	}
+	u.baseOffset = baseOffset
+	u.dirOffset = int64(end.directoryOffset)
+	// Since the number of directory records is not validated, it is not
+	// safe to preallocate r.File without first checking that the specified
+	// number of files is reasonable, since a malformed archive may
+	// indicate it contains up to 1 << 128 - 1 files. Since each file has a
+	// header which will be _at least_ 30 bytes we can safely preallocate
+	// if (data size / 30) >= end.directoryRecords.
+	if end.directorySize < uint64(size) && (uint64(size)-end.directorySize)/30 >= end.directoryRecords {
+		u.dir = make([]*header, 0, end.directoryRecords)
+	}
+	u.comment = end.comment
+	if _, err = u.rw.Seek(u.baseOffset+int64(end.directoryOffset), io.SeekStart); err != nil {
+		return err
+	}
+
+	// The count of files inside a zip is truncated to fit in a uint16.
+	// Gloss over this by reading headers until we encounter
+	// a bad one, and then only report an ErrFormat or UnexpectedEOF if
+	// the file count modulo 65536 is incorrect.
+	for {
+		f := &File{zip: nil, zipr: u.rw}
+		err = readDirectoryHeader(f, u.rw)
+		if err == ErrFormat || err == io.ErrUnexpectedEOF {
+			break
+		}
+		if err != nil {
+			return err
+		}
+		f.headerOffset += u.baseOffset
+		h := &header{
+			FileHeader: &f.FileHeader,
+			offset:     uint64(f.headerOffset),
+		}
+		u.dir = append(u.dir, h)
+	}
+	if uint16(len(u.dir)) != uint16(end.directoryRecords) { // only compare 16 bits here
+		// Return the readDirectoryHeader error if we read
+		// the wrong number of directory entries.
+		return err
+	}
+
+	// Ensure the directory record is ordered by file header offset.
+	slices.SortFunc(u.dir, sortDirectoryFunc)
+	for _, d := range u.dir {
+		if d.Name == "" {
+			// Zip permits an empty file name field.
+			continue
+		}
+		// The zip specification states that names must use forward slashes,
+		// so consider any backslashes in the name insecure.
+		if !filepath.IsLocal(d.Name) || strings.Contains(d.Name, "\\") {
+			return ErrInsecurePath
+		}
+	}
+	return nil
+}
+
+// Append adds a file to the zip file using the provided name.
+// It returns a [Writer] to which the file contents should be written.
+// The file contents will be compressed using the Deflate method.
+// The name must be a relative path: it must not start with a drive
+// letter (e.g. C:) or leading slash, and only forward slashes are
+// allowed. To create a directory instead of a file, add a trailing
+// slash to the name.
+//
+// If mode is set to [APPEND_MODE_OVERWRITE] and the file name already exists
+// in the zip archive, Append will delete the existing file data and write the
+// new file data at the end of the zip file.
+//
+// If mode is set to [APPEND_MODE_KEEP_ORIGINAL], the existing data won't be
+// deleted from the zip file and Append only writes the new file data, under
+// the same file name, at the end of the zip file.
+//
+// The file's contents must be written to the io.Writer before the next
+// call to [Append], [AppendHeader], or [Close].
+func (u *Updater) Append(name string, mode AppendMode) (io.Writer, error) {
+	h := &FileHeader{
+		Name:   name,
+		Method: Deflate,
+	}
+	return u.AppendHeader(h, mode)
+}
+
+func (u *Updater) prepare(fh *FileHeader) error {
+	if u.last != nil && !u.last.closed {
+		if err := u.last.close(); err != nil {
+			return err
+		}
+		offset, err := u.rw.offset()
+		if err != nil {
+			return err
+		}
+		if u.dirOffset < offset {
+			u.dirOffset = offset
+		}
+	}
+	if len(u.dir) > 0 && u.dir[len(u.dir)-1].FileHeader == fh {
+		// See https://golang.org/issue/11144 confusion.
+		return errors.New("archive/zip: invalid duplicate FileHeader")
+	}
+	return nil
+}
+
+// AppendHeader adds a file to the zip archive using the provided [FileHeader]
+// for the file metadata, written at the appropriate offset.
+// The Updater takes ownership of fh and may mutate its fields.
+// The caller must not modify fh after calling AppendHeader.
+//
+// If mode is [APPEND_MODE_OVERWRITE] and the file name of the [FileHeader]
+// already exists in the zip file, AppendHeader removes the existing file data
+// and writes the new file data at the end of the archive file.
+//
+// Note that the size of the newly appended file should be larger than the
+// size of the replaced file; especially when using the Deflate compression
+// method, the compressed data size may be larger than the original file
+// data size.
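Putting the Updater API documented above together, a hedged end-to-end sketch: open an existing archive read-write, replace one member, and close. The archive name "site.zip" and member "index.html" are hypothetical:

```go
package main

import (
	"log"
	"os"

	zip "github.com/STARRY-S/zip"
)

func main() {
	f, err := os.OpenFile("site.zip", os.O_RDWR, 0)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	u, err := zip.NewUpdater(f)
	if err != nil {
		log.Fatal(err)
	}

	// Overwrite mode drops the old "index.html" data and appends the new
	// bytes; APPEND_MODE_KEEP_ORIGINAL would leave two same-named entries.
	w, err := u.Append("index.html", zip.APPEND_MODE_OVERWRITE)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := w.Write([]byte("<h1>hello</h1>")); err != nil {
		log.Fatal(err)
	}

	// Close rewrites the central directory; without it the archive is invalid.
	if err := u.Close(); err != nil {
		log.Fatal(err)
	}
}
```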
+func (u *Updater) AppendHeader(fh *FileHeader, mode AppendMode) (io.Writer, error) { + if err := u.prepare(fh); err != nil { + return nil, err + } + + var err error + var offset int64 = -1 + var existingDirIndex int = -1 + if mode == APPEND_MODE_OVERWRITE { + for i, d := range u.dir { + if d.Name == fh.Name { + offset = int64(d.offset) + existingDirIndex = i + break + } + } + } + if offset < 0 { + offset = u.dirOffset + } + if existingDirIndex >= 0 { + if offset, err = u.removeFile(existingDirIndex); err != nil { + return nil, err + } + } + + // Seek the file offset. + if _, err := u.rw.Seek(offset, io.SeekStart); err != nil { + return nil, err + } + u.offset = offset + + // The ZIP format has a sad state of affairs regarding character encoding. + // Officially, the name and comment fields are supposed to be encoded + // in CP-437 (which is mostly compatible with ASCII), unless the UTF-8 + // flag bit is set. However, there are several problems: + // + // * Many ZIP readers still do not support UTF-8. + // * If the UTF-8 flag is cleared, several readers simply interpret the + // name and comment fields as whatever the local system encoding is. + // + // In order to avoid breaking readers without UTF-8 support, + // we avoid setting the UTF-8 flag if the strings are CP-437 compatible. + // However, if the strings require multibyte UTF-8 encoding and is a + // valid UTF-8 string, then we set the UTF-8 bit. + // + // For the case, where the user explicitly wants to specify the encoding + // as UTF-8, they will need to set the flag bit themselves. + utf8Valid1, utf8Require1 := detectUTF8(fh.Name) + utf8Valid2, utf8Require2 := detectUTF8(fh.Comment) + switch { + case fh.NonUTF8: + fh.Flags &^= 0x800 + case (utf8Require1 || utf8Require2) && (utf8Valid1 && utf8Valid2): + fh.Flags |= 0x800 + } + + fh.CreatorVersion = fh.CreatorVersion&0xff00 | zipVersion20 // preserve compatibility byte + fh.ReaderVersion = zipVersion20 + + // If Modified is set, this takes precedence over MS-DOS timestamp fields. + if !fh.Modified.IsZero() { + // Contrary to the FileHeader.SetModTime method, we intentionally + // do not convert to UTC, because we assume the user intends to encode + // the date using the specified timezone. A user may want this control + // because many legacy ZIP readers interpret the timestamp according + // to the local timezone. + // + // The timezone is only non-UTC if a user directly sets the Modified + // field directly themselves. All other approaches sets UTC. + fh.ModifiedDate, fh.ModifiedTime = timeToMsDosTime(fh.Modified) + + // Use "extended timestamp" format since this is what Info-ZIP uses. + // Nearly every major ZIP implementation uses a different format, + // but at least most seem to be able to understand the other formats. + // + // This format happens to be identical for both local and central header + // if modification time is the only timestamp being encoded. + var mbuf [9]byte // 2*SizeOf(uint16) + SizeOf(uint8) + SizeOf(uint32) + mt := uint32(fh.Modified.Unix()) + eb := writeBuf(mbuf[:]) + eb.uint16(extTimeExtraID) + eb.uint16(5) // Size: SizeOf(uint8) + SizeOf(uint32) + eb.uint8(1) // Flags: ModTime + eb.uint32(mt) // ModTime + fh.Extra = append(fh.Extra, mbuf[:]...) + } + + var ( + ow io.Writer + fw *fileWriter + ) + h := &header{ + FileHeader: fh, + offset: uint64(u.offset), + } + if strings.HasSuffix(fh.Name, "/") { + // Set the compression method to Store to ensure data length is truly zero, + // which the writeHeader method always encodes for the size fields. 
+		// This is necessary as most compression formats have non-zero lengths
+		// even when compressing an empty string.
+		fh.Method = Store
+		fh.Flags &^= 0x8 // we will not write a data descriptor
+
+		// Explicitly clear sizes as they have no meaning for directories.
+		fh.CompressedSize = 0
+		fh.CompressedSize64 = 0
+		fh.UncompressedSize = 0
+		fh.UncompressedSize64 = 0
+
+		ow = dirWriter{}
+	} else {
+		fh.Flags |= 0x8 // we will write a data descriptor
+
+		fw = &fileWriter{
+			zipw:      u.rw,
+			compCount: &countWriter{w: u.rw},
+			crc32:     crc32.NewIEEE(),
+		}
+		comp := u.compressor(fh.Method)
+		if comp == nil {
+			return nil, ErrAlgorithm
+		}
+		var err error
+		fw.comp, err = comp(fw.compCount)
+		if err != nil {
+			return nil, err
+		}
+		fw.rawCount = &countWriter{w: fw.comp}
+		fw.header = h
+		ow = fw
+	}
+	u.dir = append(u.dir, h)
+	// No need to re-sort u.dir here since the newly created header is
+	// written at the end of the files.
+	if err := writeHeader(u.rw, h); err != nil {
+		return nil, err
+	}
+	// If we're creating a directory, fw is nil.
+	u.last = fw
+	offset, err = u.rw.offset()
+	if err != nil {
+		return nil, err
+	}
+	if u.dirOffset < offset {
+		u.dirOffset = offset
+	}
+
+	return ow, nil
+}
+
+// removeFile removes a file from the zip by shifting the following data
+// forward and dropping its directory record.
+func (u *Updater) removeFile(dirIndex int) (int64, error) {
+	// start is the file header offset.
+	var start = int64(u.dir[dirIndex].offset)
+	// end is the next file header offset or directory offset.
+	var end int64
+	if dirIndex == len(u.dir)-1 {
+		end = u.dirOffset
+	} else {
+		end = int64(u.dir[dirIndex+1].offset)
+	}
+	// size is the file header and compressed data size.
+	var size = end - start
+
+	// Allocate a buffer for moving the file data.
+	var buffer = make([]byte, bufferSize)
+	var rp int64 = end   // read point
+	var wp int64 = start // write point
+	// Move the data forward in buffer-sized blocks.
+	for rp < u.dirOffset-bufferSize {
+		n, err := u.rw.ReadAt(buffer, rp)
+		if err != nil {
+			return 0, fmt.Errorf("zip: rewind data: ReadAt: %w", err)
+		}
+		_, err = u.rw.WriteAt(buffer[:n], wp)
+		if err != nil {
+			return 0, fmt.Errorf("zip: rewind data: WriteAt: %w", err)
+		}
+		rp += int64(n)
+		wp += int64(n)
+	}
+	// Move the remaining data that is smaller than one buffer-sized block.
+	if rp < u.dirOffset {
+		n, err := u.rw.ReadAt(buffer[:u.dirOffset-rp], rp)
+		if err != nil {
+			return 0, fmt.Errorf("zip: rewind data: ReadAt: %w", err)
+		}
+		_, err = u.rw.WriteAt(buffer[:n], wp)
+		if err != nil {
+			return 0, fmt.Errorf("zip: rewind data: WriteAt: %w", err)
+		}
+		rp += int64(n)
+		wp += int64(n)
+		// assert: all data before the directory record has been moved
+		if rp != u.dirOffset {
+			return 0, errors.New("zip: rewind data: read data before directory failed")
+		}
+	}
+	// Remove the deleted file's directory record.
+	u.dir = append(u.dir[:dirIndex], u.dir[dirIndex+1:]...)
+	// Update the file header offsets in the directory records.
+	for i := dirIndex; i < len(u.dir); i++ {
+		u.dir[i].offset -= uint64(size)
+		u.dir[i].Extra = nil // zip64 extra data is re-generated when calling Close
+	}
+	return wp, nil
+}
+
+func (u *Updater) compressor(method uint16) Compressor {
+	comp := u.compressors[method]
+	if comp == nil {
+		comp = compressor(method)
+	}
+	return comp
+}
+
+func (u *Updater) SetComment(comment string) error {
+	if len(comment) > uint16max {
+		return errors.New("zip: Writer.Comment too long")
+	}
+	u.comment = comment
+	return nil
+}
+
+func (u *Updater) GetComment() string {
+	return u.comment
+}
+
+func (u *Updater) Close() error {
+	if u.last != nil && !u.last.closed {
+		if err := u.last.close(); err != nil {
+			return err
+		}
+		u.last = nil
+	}
+	if u.closed {
+		return errors.New("zip: updater closed twice")
+	}
+	u.closed = true
+
+	// write central directory
+	start, err := u.rw.offset()
+	if err != nil {
+		return err
+	}
+	if start < u.dirOffset {
+		// Zero out the data between the last file and the directory record.
+		// NOTE: this step is not mandatory but keeps the file data clean.
+		var buffSize int64
+		var buffer []byte
+		size := u.dirOffset - start
+		if u.dirOffset-start > bufferSize {
+			buffer = make([]byte, bufferSize)
+			buffSize = bufferSize
+		} else {
+			buffer = make([]byte, size)
+			buffSize = size
+		}
+		var wp = start
+		_, err = u.rw.Seek(wp, io.SeekStart)
+		if err != nil {
+			return err
+		}
+		// Write `\0` in block-sized chunks.
+		for wp < u.dirOffset-buffSize {
+			n, err := u.rw.Write(buffer)
+			if err != nil {
+				return err
+			}
+			wp += int64(n)
+		}
+		if wp < u.dirOffset {
+			if _, err := u.rw.Write(buffer[:u.dirOffset-wp]); err != nil {
+				return err
+			}
+		}
+		start = u.dirOffset
+	}
+	for _, h := range u.dir {
+		var buf []byte = make([]byte, directoryHeaderLen)
+		b := writeBuf(buf)
+		b.uint32(uint32(directoryHeaderSignature))
+		b.uint16(h.CreatorVersion)
+		b.uint16(h.ReaderVersion)
+		b.uint16(h.Flags)
+		b.uint16(h.Method)
+		b.uint16(h.ModifiedTime)
+		b.uint16(h.ModifiedDate)
+		b.uint32(h.CRC32)
+		if h.isZip64() || h.offset >= uint32max {
+			// the file needs a zip64 header. store maxint in both
+			// 32 bit size fields (and offset later) to signal that the
+			// zip64 extra header should be used.
+			b.uint32(uint32max) // compressed size
+			b.uint32(uint32max) // uncompressed size
+
+			// append a zip64 extra block to Extra
+			var buf [28]byte // 2x uint16 + 3x uint64
+			eb := writeBuf(buf[:])
+			eb.uint16(zip64ExtraID)
+			eb.uint16(24) // size = 3x uint64
+			eb.uint64(h.UncompressedSize64)
+			eb.uint64(h.CompressedSize64)
+			eb.uint64(uint64(h.offset))
+			h.Extra = append(h.Extra, buf[:]...)
+ } else { + b.uint32(h.CompressedSize) + b.uint32(h.UncompressedSize) + } + + b.uint16(uint16(len(h.Name))) + b.uint16(uint16(len(h.Extra))) + b.uint16(uint16(len(h.Comment))) + b = b[4:] // skip disk number start and internal file attr (2x uint16) + b.uint32(h.ExternalAttrs) + if h.offset > uint32max { + b.uint32(uint32max) + } else { + b.uint32(uint32(h.offset)) + } + if _, err := u.rw.Write(buf); err != nil { + return err + } + if _, err := io.WriteString(u.rw, h.Name); err != nil { + return err + } + if _, err := u.rw.Write(h.Extra); err != nil { + return err + } + if _, err := io.WriteString(u.rw, h.Comment); err != nil { + return err + } + } + end, err := u.rw.offset() + if err != nil { + return err + } + + records := uint64(len(u.dir)) + size := uint64(end - start) + offset := uint64(start) + + if records >= uint16max || size >= uint32max || offset >= uint32max { + var buf [directory64EndLen + directory64LocLen]byte + b := writeBuf(buf[:]) + + // zip64 end of central directory record + b.uint32(directory64EndSignature) + b.uint64(directory64EndLen - 12) // length minus signature (uint32) and length fields (uint64) + b.uint16(zipVersion45) // version made by + b.uint16(zipVersion45) // version needed to extract + b.uint32(0) // number of this disk + b.uint32(0) // number of the disk with the start of the central directory + b.uint64(records) // total number of entries in the central directory on this disk + b.uint64(records) // total number of entries in the central directory + b.uint64(size) // size of the central directory + b.uint64(offset) // offset of start of central directory with respect to the starting disk number + + // zip64 end of central directory locator + b.uint32(directory64LocSignature) + b.uint32(0) // number of the disk with the start of the zip64 end of central directory + b.uint64(uint64(end)) // relative offset of the zip64 end of central directory record + b.uint32(1) // total number of disks + + if _, err := u.rw.Write(buf[:]); err != nil { + return err + } + + // store max values in the regular end record to signal + // that the zip64 values should be used instead + records = uint16max + size = uint32max + offset = uint32max + } + + // write end record + var buf [directoryEndLen]byte + b := writeBuf(buf[:]) + b.uint32(uint32(directoryEndSignature)) + b = b[4:] // skip over disk number and first disk number (2x uint16) + b.uint16(uint16(records)) // number of entries this disk + b.uint16(uint16(records)) // number of entries total + b.uint32(uint32(size)) // size of directory + b.uint32(uint32(offset)) // start of directory + b.uint16(uint16(len(u.comment))) // byte size of EOCD comment + if _, err := u.rw.Write(buf[:]); err != nil { + return err + } + if _, err := io.WriteString(u.rw, u.comment); err != nil { + return err + } + + return nil +} + +func sortDirectoryFunc(a, b *header) int { + switch { + case a.offset > b.offset: + return 1 + case a.offset < b.offset: + return -1 + } + return 0 +} diff --git a/vendor/github.com/STARRY-S/zip/writer.go b/vendor/github.com/STARRY-S/zip/writer.go new file mode 100644 index 0000000000..e33df2431c --- /dev/null +++ b/vendor/github.com/STARRY-S/zip/writer.go @@ -0,0 +1,666 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package zip + +import ( + "bufio" + "encoding/binary" + "errors" + "hash" + "hash/crc32" + "io" + "io/fs" + "strings" + "unicode/utf8" +) + +var ( + errLongName = errors.New("zip: FileHeader.Name too long") + errLongExtra = errors.New("zip: FileHeader.Extra too long") +) + +// Writer implements a zip file writer. +type Writer struct { + cw *countWriter + dir []*header + last *fileWriter + closed bool + compressors map[uint16]Compressor + comment string + + // testHookCloseSizeOffset if non-nil is called with the size + // of offset of the central directory at Close. + testHookCloseSizeOffset func(size, offset uint64) +} + +type header struct { + *FileHeader + offset uint64 + raw bool +} + +// NewWriter returns a new [Writer] writing a zip file to w. +func NewWriter(w io.Writer) *Writer { + return &Writer{cw: &countWriter{w: bufio.NewWriter(w)}} +} + +// SetOffset sets the offset of the beginning of the zip data within the +// underlying writer. It should be used when the zip data is appended to an +// existing file, such as a binary executable. +// It must be called before any data is written. +func (w *Writer) SetOffset(n int64) { + if w.cw.count != 0 { + panic("zip: SetOffset called after data was written") + } + w.cw.count = n +} + +// Flush flushes any buffered data to the underlying writer. +// Calling Flush is not normally necessary; calling Close is sufficient. +func (w *Writer) Flush() error { + return w.cw.w.(*bufio.Writer).Flush() +} + +// SetComment sets the end-of-central-directory comment field. +// It can only be called before [Writer.Close]. +func (w *Writer) SetComment(comment string) error { + if len(comment) > uint16max { + return errors.New("zip: Writer.Comment too long") + } + w.comment = comment + return nil +} + +// Close finishes writing the zip file by writing the central directory. +// It does not close the underlying writer. +func (w *Writer) Close() error { + if w.last != nil && !w.last.closed { + if err := w.last.close(); err != nil { + return err + } + w.last = nil + } + if w.closed { + return errors.New("zip: writer closed twice") + } + w.closed = true + + // write central directory + start := w.cw.count + for _, h := range w.dir { + var buf [directoryHeaderLen]byte + b := writeBuf(buf[:]) + b.uint32(uint32(directoryHeaderSignature)) + b.uint16(h.CreatorVersion) + b.uint16(h.ReaderVersion) + b.uint16(h.Flags) + b.uint16(h.Method) + b.uint16(h.ModifiedTime) + b.uint16(h.ModifiedDate) + b.uint32(h.CRC32) + if h.isZip64() || h.offset >= uint32max { + // the file needs a zip64 header. store maxint in both + // 32 bit size fields (and offset later) to signal that the + // zip64 extra header should be used. + b.uint32(uint32max) // compressed size + b.uint32(uint32max) // uncompressed size + + // append a zip64 extra block to Extra + var buf [28]byte // 2x uint16 + 3x uint64 + eb := writeBuf(buf[:]) + eb.uint16(zip64ExtraID) + eb.uint16(24) // size = 3x uint64 + eb.uint64(h.UncompressedSize64) + eb.uint64(h.CompressedSize64) + eb.uint64(h.offset) + h.Extra = append(h.Extra, buf[:]...) 
+ } else { + b.uint32(h.CompressedSize) + b.uint32(h.UncompressedSize) + } + + b.uint16(uint16(len(h.Name))) + b.uint16(uint16(len(h.Extra))) + b.uint16(uint16(len(h.Comment))) + b = b[4:] // skip disk number start and internal file attr (2x uint16) + b.uint32(h.ExternalAttrs) + if h.offset > uint32max { + b.uint32(uint32max) + } else { + b.uint32(uint32(h.offset)) + } + if _, err := w.cw.Write(buf[:]); err != nil { + return err + } + if _, err := io.WriteString(w.cw, h.Name); err != nil { + return err + } + if _, err := w.cw.Write(h.Extra); err != nil { + return err + } + if _, err := io.WriteString(w.cw, h.Comment); err != nil { + return err + } + } + end := w.cw.count + + records := uint64(len(w.dir)) + size := uint64(end - start) + offset := uint64(start) + + if f := w.testHookCloseSizeOffset; f != nil { + f(size, offset) + } + + if records >= uint16max || size >= uint32max || offset >= uint32max { + var buf [directory64EndLen + directory64LocLen]byte + b := writeBuf(buf[:]) + + // zip64 end of central directory record + b.uint32(directory64EndSignature) + b.uint64(directory64EndLen - 12) // length minus signature (uint32) and length fields (uint64) + b.uint16(zipVersion45) // version made by + b.uint16(zipVersion45) // version needed to extract + b.uint32(0) // number of this disk + b.uint32(0) // number of the disk with the start of the central directory + b.uint64(records) // total number of entries in the central directory on this disk + b.uint64(records) // total number of entries in the central directory + b.uint64(size) // size of the central directory + b.uint64(offset) // offset of start of central directory with respect to the starting disk number + + // zip64 end of central directory locator + b.uint32(directory64LocSignature) + b.uint32(0) // number of the disk with the start of the zip64 end of central directory + b.uint64(uint64(end)) // relative offset of the zip64 end of central directory record + b.uint32(1) // total number of disks + + if _, err := w.cw.Write(buf[:]); err != nil { + return err + } + + // store max values in the regular end record to signal + // that the zip64 values should be used instead + records = uint16max + size = uint32max + offset = uint32max + } + + // write end record + var buf [directoryEndLen]byte + b := writeBuf(buf[:]) + b.uint32(uint32(directoryEndSignature)) + b = b[4:] // skip over disk number and first disk number (2x uint16) + b.uint16(uint16(records)) // number of entries this disk + b.uint16(uint16(records)) // number of entries total + b.uint32(uint32(size)) // size of directory + b.uint32(uint32(offset)) // start of directory + b.uint16(uint16(len(w.comment))) // byte size of EOCD comment + if _, err := w.cw.Write(buf[:]); err != nil { + return err + } + if _, err := io.WriteString(w.cw, w.comment); err != nil { + return err + } + + return w.cw.w.(*bufio.Writer).Flush() +} + +// Create adds a file to the zip file using the provided name. +// It returns a [Writer] to which the file contents should be written. +// The file contents will be compressed using the [Deflate] method. +// The name must be a relative path: it must not start with a drive +// letter (e.g. C:) or leading slash, and only forward slashes are +// allowed. To create a directory instead of a file, add a trailing +// slash to the name. +// The file's contents must be written to the [io.Writer] before the next +// call to [Writer.Create], [Writer.CreateHeader], or [Writer.Close]. 
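A minimal sketch of the Create workflow described above; the import alias is an assumption for illustration:

```go
package main

import (
	"bytes"
	"log"

	zip "github.com/STARRY-S/zip"
)

func main() {
	var buf bytes.Buffer
	w := zip.NewWriter(&buf)

	// Relative, slash-separated name; a trailing slash would create a directory.
	fw, err := w.Create("docs/readme.txt")
	if err != nil {
		log.Fatal(err)
	}
	if _, err := fw.Write([]byte("hello zip")); err != nil {
		log.Fatal(err)
	}

	// Close writes the central directory; the buffer now holds a valid archive.
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
	log.Printf("archive is %d bytes", buf.Len())
}
```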
+func (w *Writer) Create(name string) (io.Writer, error) { + header := &FileHeader{ + Name: name, + Method: Deflate, + } + return w.CreateHeader(header) +} + +// detectUTF8 reports whether s is a valid UTF-8 string, and whether the string +// must be considered UTF-8 encoding (i.e., not compatible with CP-437, ASCII, +// or any other common encoding). +func detectUTF8(s string) (valid, require bool) { + for i := 0; i < len(s); { + r, size := utf8.DecodeRuneInString(s[i:]) + i += size + // Officially, ZIP uses CP-437, but many readers use the system's + // local character encoding. Most encoding are compatible with a large + // subset of CP-437, which itself is ASCII-like. + // + // Forbid 0x7e and 0x5c since EUC-KR and Shift-JIS replace those + // characters with localized currency and overline characters. + if r < 0x20 || r > 0x7d || r == 0x5c { + if !utf8.ValidRune(r) || (r == utf8.RuneError && size == 1) { + return false, false + } + require = true + } + } + return true, require +} + +// prepare performs the bookkeeping operations required at the start of +// CreateHeader and CreateRaw. +func (w *Writer) prepare(fh *FileHeader) error { + if w.last != nil && !w.last.closed { + if err := w.last.close(); err != nil { + return err + } + } + if len(w.dir) > 0 && w.dir[len(w.dir)-1].FileHeader == fh { + // See https://golang.org/issue/11144 confusion. + return errors.New("archive/zip: invalid duplicate FileHeader") + } + return nil +} + +// CreateHeader adds a file to the zip archive using the provided [FileHeader] +// for the file metadata. [Writer] takes ownership of fh and may mutate +// its fields. The caller must not modify fh after calling [Writer.CreateHeader]. +// +// This returns a [Writer] to which the file contents should be written. +// The file's contents must be written to the io.Writer before the next +// call to [Writer.Create], [Writer.CreateHeader], [Writer.CreateRaw], or [Writer.Close]. +func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { + if err := w.prepare(fh); err != nil { + return nil, err + } + + // The ZIP format has a sad state of affairs regarding character encoding. + // Officially, the name and comment fields are supposed to be encoded + // in CP-437 (which is mostly compatible with ASCII), unless the UTF-8 + // flag bit is set. However, there are several problems: + // + // * Many ZIP readers still do not support UTF-8. + // * If the UTF-8 flag is cleared, several readers simply interpret the + // name and comment fields as whatever the local system encoding is. + // + // In order to avoid breaking readers without UTF-8 support, + // we avoid setting the UTF-8 flag if the strings are CP-437 compatible. + // However, if the strings require multibyte UTF-8 encoding and is a + // valid UTF-8 string, then we set the UTF-8 bit. + // + // For the case, where the user explicitly wants to specify the encoding + // as UTF-8, they will need to set the flag bit themselves. + utf8Valid1, utf8Require1 := detectUTF8(fh.Name) + utf8Valid2, utf8Require2 := detectUTF8(fh.Comment) + switch { + case fh.NonUTF8: + fh.Flags &^= 0x800 + case (utf8Require1 || utf8Require2) && (utf8Valid1 && utf8Valid2): + fh.Flags |= 0x800 + } + + fh.CreatorVersion = fh.CreatorVersion&0xff00 | zipVersion20 // preserve compatibility byte + fh.ReaderVersion = zipVersion20 + + // If Modified is set, this takes precedence over MS-DOS timestamp fields. 
+ if !fh.Modified.IsZero() { + // Contrary to the FileHeader.SetModTime method, we intentionally + // do not convert to UTC, because we assume the user intends to encode + // the date using the specified timezone. A user may want this control + // because many legacy ZIP readers interpret the timestamp according + // to the local timezone. + // + // The timezone is only non-UTC if a user directly sets the Modified + // field directly themselves. All other approaches sets UTC. + fh.ModifiedDate, fh.ModifiedTime = timeToMsDosTime(fh.Modified) + + // Use "extended timestamp" format since this is what Info-ZIP uses. + // Nearly every major ZIP implementation uses a different format, + // but at least most seem to be able to understand the other formats. + // + // This format happens to be identical for both local and central header + // if modification time is the only timestamp being encoded. + var mbuf [9]byte // 2*SizeOf(uint16) + SizeOf(uint8) + SizeOf(uint32) + mt := uint32(fh.Modified.Unix()) + eb := writeBuf(mbuf[:]) + eb.uint16(extTimeExtraID) + eb.uint16(5) // Size: SizeOf(uint8) + SizeOf(uint32) + eb.uint8(1) // Flags: ModTime + eb.uint32(mt) // ModTime + fh.Extra = append(fh.Extra, mbuf[:]...) + } + + var ( + ow io.Writer + fw *fileWriter + ) + h := &header{ + FileHeader: fh, + offset: uint64(w.cw.count), + } + + if strings.HasSuffix(fh.Name, "/") { + // Set the compression method to Store to ensure data length is truly zero, + // which the writeHeader method always encodes for the size fields. + // This is necessary as most compression formats have non-zero lengths + // even when compressing an empty string. + fh.Method = Store + fh.Flags &^= 0x8 // we will not write a data descriptor + + // Explicitly clear sizes as they have no meaning for directories. + fh.CompressedSize = 0 + fh.CompressedSize64 = 0 + fh.UncompressedSize = 0 + fh.UncompressedSize64 = 0 + + ow = dirWriter{} + } else { + fh.Flags |= 0x8 // we will write a data descriptor + + fw = &fileWriter{ + zipw: w.cw, + compCount: &countWriter{w: w.cw}, + crc32: crc32.NewIEEE(), + } + comp := w.compressor(fh.Method) + if comp == nil { + return nil, ErrAlgorithm + } + var err error + fw.comp, err = comp(fw.compCount) + if err != nil { + return nil, err + } + fw.rawCount = &countWriter{w: fw.comp} + fw.header = h + ow = fw + } + w.dir = append(w.dir, h) + if err := writeHeader(w.cw, h); err != nil { + return nil, err + } + // If we're creating a directory, fw is nil. + w.last = fw + return ow, nil +} + +func writeHeader(w io.Writer, h *header) error { + const maxUint16 = 1<<16 - 1 + if len(h.Name) > maxUint16 { + return errLongName + } + if len(h.Extra) > maxUint16 { + return errLongExtra + } + + var buf [fileHeaderLen]byte + b := writeBuf(buf[:]) + b.uint32(uint32(fileHeaderSignature)) + b.uint16(h.ReaderVersion) + b.uint16(h.Flags) + b.uint16(h.Method) + b.uint16(h.ModifiedTime) + b.uint16(h.ModifiedDate) + // In raw mode (caller does the compression), the values are either + // written here or in the trailing data descriptor based on the header + // flags. + if h.raw && !h.hasDataDescriptor() { + b.uint32(h.CRC32) + b.uint32(uint32(min(h.CompressedSize64, uint32max))) + b.uint32(uint32(min(h.UncompressedSize64, uint32max))) + } else { + // When this package handle the compression, these values are + // always written to the trailing data descriptor. 
+ b.uint32(0) // crc32 + b.uint32(0) // compressed size + b.uint32(0) // uncompressed size + } + b.uint16(uint16(len(h.Name))) + b.uint16(uint16(len(h.Extra))) + if _, err := w.Write(buf[:]); err != nil { + return err + } + if _, err := io.WriteString(w, h.Name); err != nil { + return err + } + _, err := w.Write(h.Extra) + return err +} + +// CreateRaw adds a file to the zip archive using the provided [FileHeader] and +// returns a [Writer] to which the file contents should be written. The file's +// contents must be written to the io.Writer before the next call to [Writer.Create], +// [Writer.CreateHeader], [Writer.CreateRaw], or [Writer.Close]. +// +// In contrast to [Writer.CreateHeader], the bytes passed to Writer are not compressed. +func (w *Writer) CreateRaw(fh *FileHeader) (io.Writer, error) { + if err := w.prepare(fh); err != nil { + return nil, err + } + + fh.CompressedSize = uint32(min(fh.CompressedSize64, uint32max)) + fh.UncompressedSize = uint32(min(fh.UncompressedSize64, uint32max)) + + h := &header{ + FileHeader: fh, + offset: uint64(w.cw.count), + raw: true, + } + w.dir = append(w.dir, h) + if err := writeHeader(w.cw, h); err != nil { + return nil, err + } + + if strings.HasSuffix(fh.Name, "/") { + w.last = nil + return dirWriter{}, nil + } + + fw := &fileWriter{ + header: h, + zipw: w.cw, + } + w.last = fw + return fw, nil +} + +// Copy copies the file f (obtained from a [Reader]) into w. It copies the raw +// form directly bypassing decompression, compression, and validation. +func (w *Writer) Copy(f *File) error { + r, err := f.OpenRaw() + if err != nil { + return err + } + fw, err := w.CreateRaw(&f.FileHeader) + if err != nil { + return err + } + _, err = io.Copy(fw, r) + return err +} + +// RegisterCompressor registers or overrides a custom compressor for a specific +// method ID. If a compressor for a given method is not found, [Writer] will +// default to looking up the compressor at the package level. +func (w *Writer) RegisterCompressor(method uint16, comp Compressor) { + if w.compressors == nil { + w.compressors = make(map[uint16]Compressor) + } + w.compressors[method] = comp +} + +// AddFS adds the files from fs.FS to the archive. +// It walks the directory tree starting at the root of the filesystem +// adding each file to the zip using deflate while maintaining the directory structure. 
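And a short, hedged sketch of AddFS as documented above, archiving a directory tree; the ./public directory is hypothetical, and note that AddFS fails on non-regular files:

```go
package main

import (
	"bytes"
	"log"
	"os"

	zip "github.com/STARRY-S/zip"
)

func main() {
	var buf bytes.Buffer
	w := zip.NewWriter(&buf)
	// Walks ./public and deflates every regular file, keeping the tree layout.
	if err := w.AddFS(os.DirFS("./public")); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}
	log.Printf("archive is %d bytes", buf.Len())
}
```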
+func (w *Writer) AddFS(fsys fs.FS) error { + return fs.WalkDir(fsys, ".", func(name string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + return nil + } + info, err := d.Info() + if err != nil { + return err + } + if !info.Mode().IsRegular() { + return errors.New("zip: cannot add non-regular file") + } + h, err := FileInfoHeader(info) + if err != nil { + return err + } + h.Name = name + h.Method = Deflate + fw, err := w.CreateHeader(h) + if err != nil { + return err + } + f, err := fsys.Open(name) + if err != nil { + return err + } + defer f.Close() + _, err = io.Copy(fw, f) + return err + }) +} + +func (w *Writer) compressor(method uint16) Compressor { + comp := w.compressors[method] + if comp == nil { + comp = compressor(method) + } + return comp +} + +type dirWriter struct{} + +func (dirWriter) Write(b []byte) (int, error) { + if len(b) == 0 { + return 0, nil + } + return 0, errors.New("zip: write to directory") +} + +type fileWriter struct { + *header + zipw io.Writer + rawCount *countWriter + comp io.WriteCloser + compCount *countWriter + crc32 hash.Hash32 + closed bool +} + +func (w *fileWriter) Write(p []byte) (int, error) { + if w.closed { + return 0, errors.New("zip: write to closed file") + } + if w.raw { + return w.zipw.Write(p) + } + w.crc32.Write(p) + return w.rawCount.Write(p) +} + +func (w *fileWriter) close() error { + if w.closed { + return errors.New("zip: file closed twice") + } + w.closed = true + if w.raw { + return w.writeDataDescriptor() + } + if err := w.comp.Close(); err != nil { + return err + } + + // update FileHeader + fh := w.header.FileHeader + fh.CRC32 = w.crc32.Sum32() + fh.CompressedSize64 = uint64(w.compCount.count) + fh.UncompressedSize64 = uint64(w.rawCount.count) + + if fh.isZip64() { + fh.CompressedSize = uint32max + fh.UncompressedSize = uint32max + fh.ReaderVersion = zipVersion45 // requires 4.5 - File uses ZIP64 format extensions + } else { + fh.CompressedSize = uint32(fh.CompressedSize64) + fh.UncompressedSize = uint32(fh.UncompressedSize64) + } + + return w.writeDataDescriptor() +} + +func (w *fileWriter) writeDataDescriptor() error { + if !w.hasDataDescriptor() { + return nil + } + // Write data descriptor. This is more complicated than one would + // think, see e.g. comments in zipfile.c:putextended() and + // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7073588. + // The approach here is to write 8 byte sizes if needed without + // adding a zip64 extra in the local header (too late anyway). 
+ var buf []byte + if w.isZip64() { + buf = make([]byte, dataDescriptor64Len) + } else { + buf = make([]byte, dataDescriptorLen) + } + b := writeBuf(buf) + b.uint32(dataDescriptorSignature) // de-facto standard, required by OS X + b.uint32(w.CRC32) + if w.isZip64() { + b.uint64(w.CompressedSize64) + b.uint64(w.UncompressedSize64) + } else { + b.uint32(w.CompressedSize) + b.uint32(w.UncompressedSize) + } + _, err := w.zipw.Write(buf) + return err +} + +type countWriter struct { + w io.Writer + count int64 +} + +func (w *countWriter) Write(p []byte) (int, error) { + n, err := w.w.Write(p) + w.count += int64(n) + return n, err +} + +type nopCloser struct { + io.Writer +} + +func (w nopCloser) Close() error { + return nil +} + +type writeBuf []byte + +func (b *writeBuf) uint8(v uint8) { + (*b)[0] = v + *b = (*b)[1:] +} + +func (b *writeBuf) uint16(v uint16) { + binary.LittleEndian.PutUint16(*b, v) + *b = (*b)[2:] +} + +func (b *writeBuf) uint32(v uint32) { + binary.LittleEndian.PutUint32(*b, v) + *b = (*b)[4:] +} + +func (b *writeBuf) uint64(v uint64) { + binary.LittleEndian.PutUint64(*b, v) + *b = (*b)[8:] +} diff --git a/vendor/github.com/andybalholm/brotli/README.md b/vendor/github.com/andybalholm/brotli/README.md index 1ea7fdb759..00625211d7 100644 --- a/vendor/github.com/andybalholm/brotli/README.md +++ b/vendor/github.com/andybalholm/brotli/README.md @@ -2,6 +2,13 @@ This package is a brotli compressor and decompressor implemented in Go. It was translated from the reference implementation (https://github.com/google/brotli) with the `c2go` tool at https://github.com/andybalholm/c2go. +I have been working on new compression algorithms (not translated from C) +in the matchfinder package. +You can use them with the NewWriterV2 function. +Currently they give better results than the old implementation +(at least for compressing my test file, Newton’s *Opticks*) +on levels 2 to 6. + I am using it in production with https://github.com/andybalholm/redwood. API documentation is found at https://pkg.go.dev/github.com/andybalholm/brotli?tab=doc. diff --git a/vendor/github.com/andybalholm/brotli/bitwriter.go b/vendor/github.com/andybalholm/brotli/bitwriter.go new file mode 100644 index 0000000000..dfc60360f3 --- /dev/null +++ b/vendor/github.com/andybalholm/brotli/bitwriter.go @@ -0,0 +1,56 @@ +package brotli + +/* Copyright 2010 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +/* Write bits into a byte array. */ + +type bitWriter struct { + dst []byte + + // Data waiting to be written is the low nbits of bits. 
+ bits uint64 + nbits uint +} + +func (w *bitWriter) writeBits(nb uint, b uint64) { + w.bits |= b << w.nbits + w.nbits += nb + if w.nbits >= 32 { + bits := w.bits + w.bits >>= 32 + w.nbits -= 32 + w.dst = append(w.dst, + byte(bits), + byte(bits>>8), + byte(bits>>16), + byte(bits>>24), + ) + } +} + +func (w *bitWriter) writeSingleBit(bit bool) { + if bit { + w.writeBits(1, 1) + } else { + w.writeBits(1, 0) + } +} + +func (w *bitWriter) jumpToByteBoundary() { + dst := w.dst + for w.nbits != 0 { + dst = append(dst, byte(w.bits)) + w.bits >>= 8 + if w.nbits > 8 { // Avoid underflow + w.nbits -= 8 + } else { + w.nbits = 0 + } + } + w.bits = 0 + w.dst = dst +} diff --git a/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go b/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go index 2470f84e4b..ee6552982b 100644 --- a/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go +++ b/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go @@ -7,12 +7,18 @@ import ( const maxHuffmanTreeSize = (2*numCommandSymbols + 1) -/* The maximum size of Huffman dictionary for distances assuming that - NPOSTFIX = 0 and NDIRECT = 0. */ +/* +The maximum size of Huffman dictionary for distances assuming that + + NPOSTFIX = 0 and NDIRECT = 0. +*/ const maxSimpleDistanceAlphabetSize = 140 -/* Represents the range of values belonging to a prefix code: - [offset, offset + 2^nbits) */ +/* +Represents the range of values belonging to a prefix code: + + [offset, offset + 2^nbits) +*/ type prefixCodeRange struct { offset uint32 nbits uint32 @@ -96,9 +102,12 @@ func nextBlockTypeCode(calculator *blockTypeCodeCalculator, type_ byte) uint { return type_code } -/* |nibblesbits| represents the 2 bits to encode MNIBBLES (0-3) - REQUIRES: length > 0 - REQUIRES: length <= (1 << 24) */ +/* +|nibblesbits| represents the 2 bits to encode MNIBBLES (0-3) + + REQUIRES: length > 0 + REQUIRES: length <= (1 << 24) +*/ func encodeMlen(length uint, bits *uint64, numbits *uint, nibblesbits *uint64) { var lg uint if length == 1 { @@ -121,7 +130,7 @@ func encodeMlen(length uint, bits *uint64, numbits *uint, nibblesbits *uint64) { *bits = uint64(length) - 1 } -func storeCommandExtra(cmd *command, bw *bitWriter) { +func storeCommandExtra(cmd *command, storage_ix *uint, storage []byte) { var copylen_code uint32 = commandCopyLenCode(cmd) var inscode uint16 = getInsertLengthCode(uint(cmd.insert_len_)) var copycode uint16 = getCopyLengthCode(uint(copylen_code)) @@ -129,11 +138,14 @@ func storeCommandExtra(cmd *command, bw *bitWriter) { var insextraval uint64 = uint64(cmd.insert_len_) - uint64(getInsertBase(inscode)) var copyextraval uint64 = uint64(copylen_code) - uint64(getCopyBase(copycode)) var bits uint64 = copyextraval< 0 - REQUIRES: length <= (1 << 24) */ -func storeCompressedMetaBlockHeader(is_final_block bool, length uint, bw *bitWriter) { +/* +Stores the compressed meta-block header. + + REQUIRES: length > 0 + REQUIRES: length <= (1 << 24) +*/ +func storeCompressedMetaBlockHeader(is_final_block bool, length uint, storage_ix *uint, storage []byte) { var lenbits uint64 var nlenbits uint var nibblesbits uint64 @@ -169,41 +184,44 @@ func storeCompressedMetaBlockHeader(is_final_block bool, length uint, bw *bitWri } /* Write ISLAST bit. */ - bw.writeBits(1, is_final) + writeBits(1, is_final, storage_ix, storage) /* Write ISEMPTY bit. 
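   Present only when ISLAST is set; written as 0 because this
   meta-block carries data.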
*/ if is_final_block { - bw.writeBits(1, 0) + writeBits(1, 0, storage_ix, storage) } encodeMlen(length, &lenbits, &nlenbits, &nibblesbits) - bw.writeBits(2, nibblesbits) - bw.writeBits(nlenbits, lenbits) + writeBits(2, nibblesbits, storage_ix, storage) + writeBits(nlenbits, lenbits, storage_ix, storage) if !is_final_block { /* Write ISUNCOMPRESSED bit. */ - bw.writeBits(1, 0) + writeBits(1, 0, storage_ix, storage) } } -/* Stores the uncompressed meta-block header. - REQUIRES: length > 0 - REQUIRES: length <= (1 << 24) */ -func storeUncompressedMetaBlockHeader(length uint, bw *bitWriter) { +/* +Stores the uncompressed meta-block header. + + REQUIRES: length > 0 + REQUIRES: length <= (1 << 24) +*/ +func storeUncompressedMetaBlockHeader(length uint, storage_ix *uint, storage []byte) { var lenbits uint64 var nlenbits uint var nibblesbits uint64 /* Write ISLAST bit. Uncompressed block cannot be the last one, so set to 0. */ - bw.writeBits(1, 0) + writeBits(1, 0, storage_ix, storage) encodeMlen(length, &lenbits, &nlenbits, &nibblesbits) - bw.writeBits(2, nibblesbits) - bw.writeBits(nlenbits, lenbits) + writeBits(2, nibblesbits, storage_ix, storage) + writeBits(nlenbits, lenbits, storage_ix, storage) /* Write ISUNCOMPRESSED bit. */ - bw.writeBits(1, 1) + writeBits(1, 1, storage_ix, storage) } var storeHuffmanTreeOfHuffmanTreeToBitMask_kStorageOrder = [codeLengthCodes]byte{1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15} @@ -211,7 +229,7 @@ var storeHuffmanTreeOfHuffmanTreeToBitMask_kStorageOrder = [codeLengthCodes]byte var storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeSymbols = [6]byte{0, 7, 3, 2, 1, 15} var storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeBitLengths = [6]byte{2, 4, 3, 2, 2, 4} -func storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes int, code_length_bitdepth []byte, bw *bitWriter) { +func storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes int, code_length_bitdepth []byte, storage_ix *uint, storage []byte) { var skip_some uint = 0 var codes_to_store uint = codeLengthCodes /* The bit lengths of the Huffman code over the code length alphabet @@ -241,38 +259,38 @@ func storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes int, code_length_bitdepth } } - bw.writeBits(2, uint64(skip_some)) + writeBits(2, uint64(skip_some), storage_ix, storage) { var i uint for i = skip_some; i < codes_to_store; i++ { var l uint = uint(code_length_bitdepth[storeHuffmanTreeOfHuffmanTreeToBitMask_kStorageOrder[i]]) - bw.writeBits(uint(storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeBitLengths[l]), uint64(storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeSymbols[l])) + writeBits(uint(storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeBitLengths[l]), uint64(storeHuffmanTreeOfHuffmanTreeToBitMask_kHuffmanBitLengthHuffmanCodeSymbols[l]), storage_ix, storage) } } } -func storeHuffmanTreeToBitMask(huffman_tree_size uint, huffman_tree []byte, huffman_tree_extra_bits []byte, code_length_bitdepth []byte, code_length_bitdepth_symbols []uint16, bw *bitWriter) { +func storeHuffmanTreeToBitMask(huffman_tree_size uint, huffman_tree []byte, huffman_tree_extra_bits []byte, code_length_bitdepth []byte, code_length_bitdepth_symbols []uint16, storage_ix *uint, storage []byte) { var i uint for i = 0; i < huffman_tree_size; i++ { var ix uint = uint(huffman_tree[i]) - bw.writeBits(uint(code_length_bitdepth[ix]), uint64(code_length_bitdepth_symbols[ix])) + writeBits(uint(code_length_bitdepth[ix]), 
uint64(code_length_bitdepth_symbols[ix]), storage_ix, storage) /* Extra bits */ switch ix { case repeatPreviousCodeLength: - bw.writeBits(2, uint64(huffman_tree_extra_bits[i])) + writeBits(2, uint64(huffman_tree_extra_bits[i]), storage_ix, storage) case repeatZeroCodeLength: - bw.writeBits(3, uint64(huffman_tree_extra_bits[i])) + writeBits(3, uint64(huffman_tree_extra_bits[i]), storage_ix, storage) } } } -func storeSimpleHuffmanTree(depths []byte, symbols []uint, num_symbols uint, max_bits uint, bw *bitWriter) { +func storeSimpleHuffmanTree(depths []byte, symbols []uint, num_symbols uint, max_bits uint, storage_ix *uint, storage []byte) { /* value of 1 indicates a simple Huffman code */ - bw.writeBits(2, 1) + writeBits(2, 1, storage_ix, storage) - bw.writeBits(2, uint64(num_symbols)-1) /* NSYM - 1 */ + writeBits(2, uint64(num_symbols)-1, storage_ix, storage) /* NSYM - 1 */ { /* Sort */ var i uint @@ -289,17 +307,17 @@ func storeSimpleHuffmanTree(depths []byte, symbols []uint, num_symbols uint, max } if num_symbols == 2 { - bw.writeBits(max_bits, uint64(symbols[0])) - bw.writeBits(max_bits, uint64(symbols[1])) + writeBits(max_bits, uint64(symbols[0]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[1]), storage_ix, storage) } else if num_symbols == 3 { - bw.writeBits(max_bits, uint64(symbols[0])) - bw.writeBits(max_bits, uint64(symbols[1])) - bw.writeBits(max_bits, uint64(symbols[2])) + writeBits(max_bits, uint64(symbols[0]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[1]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[2]), storage_ix, storage) } else { - bw.writeBits(max_bits, uint64(symbols[0])) - bw.writeBits(max_bits, uint64(symbols[1])) - bw.writeBits(max_bits, uint64(symbols[2])) - bw.writeBits(max_bits, uint64(symbols[3])) + writeBits(max_bits, uint64(symbols[0]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[1]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[2]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[3]), storage_ix, storage) /* tree-select */ var tmp int @@ -308,13 +326,16 @@ func storeSimpleHuffmanTree(depths []byte, symbols []uint, num_symbols uint, max } else { tmp = 0 } - bw.writeBits(1, uint64(tmp)) + writeBits(1, uint64(tmp), storage_ix, storage) } } -/* num = alphabet size - depths = symbol depths */ -func storeHuffmanTree(depths []byte, num uint, tree []huffmanTree, bw *bitWriter) { +/* +num = alphabet size + + depths = symbol depths +*/ +func storeHuffmanTree(depths []byte, num uint, tree []huffmanTree, storage_ix *uint, storage []byte) { var huffman_tree [numCommandSymbols]byte var huffman_tree_extra_bits [numCommandSymbols]byte var huffman_tree_size uint = 0 @@ -357,19 +378,22 @@ func storeHuffmanTree(depths []byte, num uint, tree []huffmanTree, bw *bitWriter convertBitDepthsToSymbols(code_length_bitdepth[:], codeLengthCodes, code_length_bitdepth_symbols[:]) /* Now, we have all the data, let's start storing it */ - storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth[:], bw) + storeHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth[:], storage_ix, storage) if num_codes == 1 { code_length_bitdepth[code] = 0 } /* Store the real Huffman tree now. 
*/ - storeHuffmanTreeToBitMask(huffman_tree_size, huffman_tree[:], huffman_tree_extra_bits[:], code_length_bitdepth[:], code_length_bitdepth_symbols[:], bw) + storeHuffmanTreeToBitMask(huffman_tree_size, huffman_tree[:], huffman_tree_extra_bits[:], code_length_bitdepth[:], code_length_bitdepth_symbols[:], storage_ix, storage) } -/* Builds a Huffman tree from histogram[0:length] into depth[0:length] and - bits[0:length] and stores the encoded tree to the bit stream. */ -func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabet_size uint, tree []huffmanTree, depth []byte, bits []uint16, bw *bitWriter) { +/* +Builds a Huffman tree from histogram[0:length] into depth[0:length] and + + bits[0:length] and stores the encoded tree to the bit stream. +*/ +func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabet_size uint, tree []huffmanTree, depth []byte, bits []uint16, storage_ix *uint, storage []byte) { var count uint = 0 var s4 = [4]uint{0} var i uint @@ -394,8 +418,8 @@ func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabe } if count <= 1 { - bw.writeBits(4, 1) - bw.writeBits(max_bits, uint64(s4[0])) + writeBits(4, 1, storage_ix, storage) + writeBits(max_bits, uint64(s4[0]), storage_ix, storage) depth[s4[0]] = 0 bits[s4[0]] = 0 return @@ -408,9 +432,9 @@ func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabe convertBitDepthsToSymbols(depth, histogram_length, bits) if count <= 4 { - storeSimpleHuffmanTree(depth, s4[:], count, max_bits, bw) + storeSimpleHuffmanTree(depth, s4[:], count, max_bits, storage_ix, storage) } else { - storeHuffmanTree(depth, histogram_length, tree, bw) + storeHuffmanTree(depth, histogram_length, tree, storage_ix, storage) } } @@ -420,7 +444,210 @@ func sortHuffmanTree1(v0 huffmanTree, v1 huffmanTree) bool { var huffmanTreePool sync.Pool -func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_bits uint, depth []byte, bits []uint16, bw *bitWriter) { +func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_bits uint, depth []byte, bits []uint16, storage_ix *uint, storage []byte) { + var count uint = 0 + var symbols = [4]uint{0} + var length uint = 0 + var total uint = histogram_total + for total != 0 { + if histogram[length] != 0 { + if count < 4 { + symbols[count] = length + } + + count++ + total -= uint(histogram[length]) + } + + length++ + } + + if count <= 1 { + writeBits(4, 1, storage_ix, storage) + writeBits(max_bits, uint64(symbols[0]), storage_ix, storage) + depth[symbols[0]] = 0 + bits[symbols[0]] = 0 + return + } + + for i := 0; i < int(length); i++ { + depth[i] = 0 + } + { + var max_tree_size uint = 2*length + 1 + tree, _ := huffmanTreePool.Get().(*[]huffmanTree) + if tree == nil || cap(*tree) < int(max_tree_size) { + tmp := make([]huffmanTree, max_tree_size) + tree = &tmp + } else { + *tree = (*tree)[:max_tree_size] + } + var count_limit uint32 + for count_limit = 1; ; count_limit *= 2 { + var node int = 0 + var l uint + for l = length; l != 0; { + l-- + if histogram[l] != 0 { + if histogram[l] >= count_limit { + initHuffmanTree(&(*tree)[node:][0], histogram[l], -1, int16(l)) + } else { + initHuffmanTree(&(*tree)[node:][0], count_limit, -1, int16(l)) + } + + node++ + } + } + { + var n int = node + /* Points to the next leaf node. */ /* Points to the next non-leaf node. 
*/ + var sentinel huffmanTree + var i int = 0 + var j int = n + 1 + var k int + + sortHuffmanTreeItems(*tree, uint(n), huffmanTreeComparator(sortHuffmanTree1)) + + /* The nodes are: + [0, n): the sorted leaf nodes that we start with. + [n]: we add a sentinel here. + [n + 1, 2n): new parent nodes are added here, starting from + (n+1). These are naturally in ascending order. + [2n]: we add a sentinel at the end as well. + There will be (2n+1) elements at the end. */ + initHuffmanTree(&sentinel, math.MaxUint32, -1, -1) + + (*tree)[node] = sentinel + node++ + (*tree)[node] = sentinel + node++ + + for k = n - 1; k > 0; k-- { + var left int + var right int + if (*tree)[i].total_count_ <= (*tree)[j].total_count_ { + left = i + i++ + } else { + left = j + j++ + } + + if (*tree)[i].total_count_ <= (*tree)[j].total_count_ { + right = i + i++ + } else { + right = j + j++ + } + + /* The sentinel node becomes the parent node. */ + (*tree)[node-1].total_count_ = (*tree)[left].total_count_ + (*tree)[right].total_count_ + + (*tree)[node-1].index_left_ = int16(left) + (*tree)[node-1].index_right_or_value_ = int16(right) + + /* Add back the last sentinel node. */ + (*tree)[node] = sentinel + node++ + } + + if setDepth(2*n-1, *tree, depth, 14) { + /* We need to pack the Huffman tree in 14 bits. If this was not + successful, add fake entities to the lowest values and retry. */ + break + } + } + } + + huffmanTreePool.Put(tree) + } + + convertBitDepthsToSymbols(depth, length, bits) + if count <= 4 { + var i uint + + /* value of 1 indicates a simple Huffman code */ + writeBits(2, 1, storage_ix, storage) + + writeBits(2, uint64(count)-1, storage_ix, storage) /* NSYM - 1 */ + + /* Sort */ + for i = 0; i < count; i++ { + var j uint + for j = i + 1; j < count; j++ { + if depth[symbols[j]] < depth[symbols[i]] { + var tmp uint = symbols[j] + symbols[j] = symbols[i] + symbols[i] = tmp + } + } + } + + if count == 2 { + writeBits(max_bits, uint64(symbols[0]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[1]), storage_ix, storage) + } else if count == 3 { + writeBits(max_bits, uint64(symbols[0]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[1]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[2]), storage_ix, storage) + } else { + writeBits(max_bits, uint64(symbols[0]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[1]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[2]), storage_ix, storage) + writeBits(max_bits, uint64(symbols[3]), storage_ix, storage) + + /* tree-select */ + var tmp int + if depth[symbols[0]] == 1 { + tmp = 1 + } else { + tmp = 0 + } + writeBits(1, uint64(tmp), storage_ix, storage) + } + } else { + var previous_value byte = 8 + var i uint + + /* Complex Huffman Tree */ + storeStaticCodeLengthCode(storage_ix, storage) + + /* Actual RLE coding. 
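+	   Zero code lengths are emitted with the kZeroReps codes; non-zero
+	   lengths are repeated via the kNonZeroReps codes once at least
+	   three repeats remain.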
*/ + for i = 0; i < length; { + var value byte = depth[i] + var reps uint = 1 + var k uint + for k = i + 1; k < length && depth[k] == value; k++ { + reps++ + } + + i += reps + if value == 0 { + writeBits(uint(kZeroRepsDepth[reps]), kZeroRepsBits[reps], storage_ix, storage) + } else { + if previous_value != value { + writeBits(uint(kCodeLengthDepth[value]), uint64(kCodeLengthBits[value]), storage_ix, storage) + reps-- + } + + if reps < 3 { + for reps != 0 { + reps-- + writeBits(uint(kCodeLengthDepth[value]), uint64(kCodeLengthBits[value]), storage_ix, storage) + } + } else { + reps -= 3 + writeBits(uint(kNonZeroRepsDepth[reps]), kNonZeroRepsBits[reps], storage_ix, storage) + } + + previous_value = value + } + } + } +} + +func buildAndStoreHuffmanTreeFastBW(histogram []uint32, histogram_total uint, max_bits uint, depth []byte, bits []uint16, bw *bitWriter) { var count uint = 0 var symbols = [4]uint{0} var length uint = 0 @@ -581,7 +808,7 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_ var i uint /* Complex Huffman Tree */ - storeStaticCodeLengthCode(bw) + storeStaticCodeLengthCodeBW(bw) /* Actual RLE coding. */ for i = 0; i < length; { @@ -668,12 +895,15 @@ func moveToFrontTransform(v_in []uint32, v_size uint, v_out []uint32) { } } -/* Finds runs of zeros in v[0..in_size) and replaces them with a prefix code of - the run length plus extra bits (lower 9 bits is the prefix code and the rest - are the extra bits). Non-zero values in v[] are shifted by - *max_length_prefix. Will not create prefix codes bigger than the initial - value of *max_run_length_prefix. The prefix code of run length L is simply - Log2Floor(L) and the number of extra bits is the same as the prefix code. */ +/* +Finds runs of zeros in v[0..in_size) and replaces them with a prefix code of + + the run length plus extra bits (lower 9 bits is the prefix code and the rest + are the extra bits). Non-zero values in v[] are shifted by + *max_length_prefix. Will not create prefix codes bigger than the initial + value of *max_run_length_prefix. The prefix code of run length L is simply + Log2Floor(L) and the number of extra bits is the same as the prefix code. 
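+	For example, a run of 5 zeros is coded with prefix Log2Floor(5) = 2
+	and extra-bits value 5 - 2^2 = 1.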
+*/ func runLengthCodeZeros(in_size uint, v []uint32, out_size *uint, max_run_length_prefix *uint32) { var max_reps uint32 = 0 var i uint @@ -733,7 +963,7 @@ const symbolBits = 9 var encodeContextMap_kSymbolMask uint32 = (1 << symbolBits) - 1 -func encodeContextMap(context_map []uint32, context_map_size uint, num_clusters uint, tree []huffmanTree, bw *bitWriter) { +func encodeContextMap(context_map []uint32, context_map_size uint, num_clusters uint, tree []huffmanTree, storage_ix *uint, storage []byte) { var i uint var rle_symbols []uint32 var max_run_length_prefix uint32 = 6 @@ -742,7 +972,7 @@ func encodeContextMap(context_map []uint32, context_map_size uint, num_clusters var depths [maxContextMapSymbols]byte var bits [maxContextMapSymbols]uint16 - storeVarLenUint8(num_clusters-1, bw) + storeVarLenUint8(num_clusters-1, storage_ix, storage) if num_clusters == 1 { return @@ -757,45 +987,48 @@ func encodeContextMap(context_map []uint32, context_map_size uint, num_clusters } { var use_rle bool = (max_run_length_prefix > 0) - bw.writeSingleBit(use_rle) + writeSingleBit(use_rle, storage_ix, storage) if use_rle { - bw.writeBits(4, uint64(max_run_length_prefix)-1) + writeBits(4, uint64(max_run_length_prefix)-1, storage_ix, storage) } } - buildAndStoreHuffmanTree(histogram[:], uint(uint32(num_clusters)+max_run_length_prefix), uint(uint32(num_clusters)+max_run_length_prefix), tree, depths[:], bits[:], bw) + buildAndStoreHuffmanTree(histogram[:], uint(uint32(num_clusters)+max_run_length_prefix), uint(uint32(num_clusters)+max_run_length_prefix), tree, depths[:], bits[:], storage_ix, storage) for i = 0; i < num_rle_symbols; i++ { var rle_symbol uint32 = rle_symbols[i] & encodeContextMap_kSymbolMask var extra_bits_val uint32 = rle_symbols[i] >> symbolBits - bw.writeBits(uint(depths[rle_symbol]), uint64(bits[rle_symbol])) + writeBits(uint(depths[rle_symbol]), uint64(bits[rle_symbol]), storage_ix, storage) if rle_symbol > 0 && rle_symbol <= max_run_length_prefix { - bw.writeBits(uint(rle_symbol), uint64(extra_bits_val)) + writeBits(uint(rle_symbol), uint64(extra_bits_val), storage_ix, storage) } } - bw.writeBits(1, 1) /* use move-to-front */ + writeBits(1, 1, storage_ix, storage) /* use move-to-front */ rle_symbols = nil } /* Stores the block switch command with index block_ix to the bit stream. */ -func storeBlockSwitch(code *blockSplitCode, block_len uint32, block_type byte, is_first_block bool, bw *bitWriter) { +func storeBlockSwitch(code *blockSplitCode, block_len uint32, block_type byte, is_first_block bool, storage_ix *uint, storage []byte) { var typecode uint = nextBlockTypeCode(&code.type_code_calculator, block_type) var lencode uint var len_nextra uint32 var len_extra uint32 if !is_first_block { - bw.writeBits(uint(code.type_depths[typecode]), uint64(code.type_bits[typecode])) + writeBits(uint(code.type_depths[typecode]), uint64(code.type_bits[typecode]), storage_ix, storage) } getBlockLengthPrefixCode(block_len, &lencode, &len_nextra, &len_extra) - bw.writeBits(uint(code.length_depths[lencode]), uint64(code.length_bits[lencode])) - bw.writeBits(uint(len_nextra), uint64(len_extra)) + writeBits(uint(code.length_depths[lencode]), uint64(code.length_bits[lencode]), storage_ix, storage) + writeBits(uint(len_nextra), uint64(len_extra), storage_ix, storage) } -/* Builds a BlockSplitCode data structure from the block split given by the - vector of block types and block lengths and stores it to the bit stream. 
*/ -func buildAndStoreBlockSplitCode(types []byte, lengths []uint32, num_blocks uint, num_types uint, tree []huffmanTree, code *blockSplitCode, bw *bitWriter) { +/* +Builds a BlockSplitCode data structure from the block split given by the + + vector of block types and block lengths and stores it to the bit stream. +*/ +func buildAndStoreBlockSplitCode(types []byte, lengths []uint32, num_blocks uint, num_types uint, tree []huffmanTree, code *blockSplitCode, storage_ix *uint, storage []byte) { var type_histo [maxBlockTypeSymbols]uint32 var length_histo [numBlockLenSymbols]uint32 var i uint @@ -813,17 +1046,17 @@ func buildAndStoreBlockSplitCode(types []byte, lengths []uint32, num_blocks uint length_histo[blockLengthPrefixCode(lengths[i])]++ } - storeVarLenUint8(num_types-1, bw) + storeVarLenUint8(num_types-1, storage_ix, storage) if num_types > 1 { /* TODO: else? could StoreBlockSwitch occur? */ - buildAndStoreHuffmanTree(type_histo[0:], num_types+2, num_types+2, tree, code.type_depths[0:], code.type_bits[0:], bw) - buildAndStoreHuffmanTree(length_histo[0:], numBlockLenSymbols, numBlockLenSymbols, tree, code.length_depths[0:], code.length_bits[0:], bw) - storeBlockSwitch(code, lengths[0], types[0], true, bw) + buildAndStoreHuffmanTree(type_histo[0:], num_types+2, num_types+2, tree, code.type_depths[0:], code.type_bits[0:], storage_ix, storage) + buildAndStoreHuffmanTree(length_histo[0:], numBlockLenSymbols, numBlockLenSymbols, tree, code.length_depths[0:], code.length_bits[0:], storage_ix, storage) + storeBlockSwitch(code, lengths[0], types[0], true, storage_ix, storage) } } /* Stores a context map where the histogram type is always the block type. */ -func storeTrivialContextMap(num_types uint, context_bits uint, tree []huffmanTree, bw *bitWriter) { - storeVarLenUint8(num_types-1, bw) +func storeTrivialContextMap(num_types uint, context_bits uint, tree []huffmanTree, storage_ix *uint, storage []byte) { + storeVarLenUint8(num_types-1, storage_ix, storage) if num_types > 1 { var repeat_code uint = context_bits - 1 var repeat_bits uint = (1 << repeat_code) - 1 @@ -837,16 +1070,16 @@ func storeTrivialContextMap(num_types uint, context_bits uint, tree []huffmanTre } /* Write RLEMAX. */ - bw.writeBits(1, 1) + writeBits(1, 1, storage_ix, storage) - bw.writeBits(4, uint64(repeat_code)-1) + writeBits(4, uint64(repeat_code)-1, storage_ix, storage) histogram[repeat_code] = uint32(num_types) histogram[0] = 1 for i = context_bits; i < alphabet_size; i++ { histogram[i] = 1 } - buildAndStoreHuffmanTree(histogram[:], alphabet_size, alphabet_size, tree, depths[:], bits[:], bw) + buildAndStoreHuffmanTree(histogram[:], alphabet_size, alphabet_size, tree, depths[:], bits[:], storage_ix, storage) for i = 0; i < num_types; i++ { var tmp uint if i == 0 { @@ -855,13 +1088,13 @@ func storeTrivialContextMap(num_types uint, context_bits uint, tree []huffmanTre tmp = i + context_bits - 1 } var code uint = tmp - bw.writeBits(uint(depths[code]), uint64(bits[code])) - bw.writeBits(uint(depths[repeat_code]), uint64(bits[repeat_code])) - bw.writeBits(repeat_code, uint64(repeat_bits)) + writeBits(uint(depths[code]), uint64(bits[code]), storage_ix, storage) + writeBits(uint(depths[repeat_code]), uint64(bits[repeat_code]), storage_ix, storage) + writeBits(repeat_code, uint64(repeat_bits), storage_ix, storage) } /* Write IMTF (inverse-move-to-front) bit. 
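   Set to 1, so the decoder applies the inverse move-to-front
   transform when reading the map back.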
*/ - bw.writeBits(1, 1) + writeBits(1, 1, storage_ix, storage) } } @@ -913,15 +1146,21 @@ func cleanupBlockEncoder(self *blockEncoder) { blockEncoderPool.Put(self) } -/* Creates entropy codes of block lengths and block types and stores them - to the bit stream. */ -func buildAndStoreBlockSwitchEntropyCodes(self *blockEncoder, tree []huffmanTree, bw *bitWriter) { - buildAndStoreBlockSplitCode(self.block_types_, self.block_lengths_, self.num_blocks_, self.num_block_types_, tree, &self.block_split_code_, bw) +/* +Creates entropy codes of block lengths and block types and stores them + + to the bit stream. +*/ +func buildAndStoreBlockSwitchEntropyCodes(self *blockEncoder, tree []huffmanTree, storage_ix *uint, storage []byte) { + buildAndStoreBlockSplitCode(self.block_types_, self.block_lengths_, self.num_blocks_, self.num_block_types_, tree, &self.block_split_code_, storage_ix, storage) } -/* Stores the next symbol with the entropy code of the current block type. - Updates the block type and block length at block boundaries. */ -func storeSymbol(self *blockEncoder, symbol uint, bw *bitWriter) { +/* +Stores the next symbol with the entropy code of the current block type. + + Updates the block type and block length at block boundaries. +*/ +func storeSymbol(self *blockEncoder, symbol uint, storage_ix *uint, storage []byte) { if self.block_len_ == 0 { self.block_ix_++ var block_ix uint = self.block_ix_ @@ -929,20 +1168,23 @@ func storeSymbol(self *blockEncoder, symbol uint, bw *bitWriter) { var block_type byte = self.block_types_[block_ix] self.block_len_ = uint(block_len) self.entropy_ix_ = uint(block_type) * self.histogram_length_ - storeBlockSwitch(&self.block_split_code_, block_len, block_type, false, bw) + storeBlockSwitch(&self.block_split_code_, block_len, block_type, false, storage_ix, storage) } self.block_len_-- { var ix uint = self.entropy_ix_ + symbol - bw.writeBits(uint(self.depths_[ix]), uint64(self.bits_[ix])) + writeBits(uint(self.depths_[ix]), uint64(self.bits_[ix]), storage_ix, storage) } } -/* Stores the next symbol with the entropy code of the current block type and - context value. - Updates the block type and block length at block boundaries. */ -func storeSymbolWithContext(self *blockEncoder, symbol uint, context uint, context_map []uint32, bw *bitWriter, context_bits uint) { +/* +Stores the next symbol with the entropy code of the current block type and + + context value. + Updates the block type and block length at block boundaries. 
+*/ +func storeSymbolWithContext(self *blockEncoder, symbol uint, context uint, context_map []uint32, storage_ix *uint, storage []byte, context_bits uint) { if self.block_len_ == 0 { self.block_ix_++ var block_ix uint = self.block_ix_ @@ -950,18 +1192,18 @@ func storeSymbolWithContext(self *blockEncoder, symbol uint, context uint, conte var block_type byte = self.block_types_[block_ix] self.block_len_ = uint(block_len) self.entropy_ix_ = uint(block_type) << context_bits - storeBlockSwitch(&self.block_split_code_, block_len, block_type, false, bw) + storeBlockSwitch(&self.block_split_code_, block_len, block_type, false, storage_ix, storage) } self.block_len_-- { var histo_ix uint = uint(context_map[self.entropy_ix_+context]) var ix uint = histo_ix*self.histogram_length_ + symbol - bw.writeBits(uint(self.depths_[ix]), uint64(self.bits_[ix])) + writeBits(uint(self.depths_[ix]), uint64(self.bits_[ix]), storage_ix, storage) } } -func buildAndStoreEntropyCodesLiteral(self *blockEncoder, histograms []histogramLiteral, histograms_size uint, alphabet_size uint, tree []huffmanTree, bw *bitWriter) { +func buildAndStoreEntropyCodesLiteral(self *blockEncoder, histograms []histogramLiteral, histograms_size uint, alphabet_size uint, tree []huffmanTree, storage_ix *uint, storage []byte) { var table_size uint = histograms_size * self.histogram_length_ if cap(self.depths_) < int(table_size) { self.depths_ = make([]byte, table_size) @@ -977,12 +1219,12 @@ func buildAndStoreEntropyCodesLiteral(self *blockEncoder, histograms []histogram var i uint for i = 0; i < histograms_size; i++ { var ix uint = i * self.histogram_length_ - buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], bw) + buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], storage_ix, storage) } } } -func buildAndStoreEntropyCodesCommand(self *blockEncoder, histograms []histogramCommand, histograms_size uint, alphabet_size uint, tree []huffmanTree, bw *bitWriter) { +func buildAndStoreEntropyCodesCommand(self *blockEncoder, histograms []histogramCommand, histograms_size uint, alphabet_size uint, tree []huffmanTree, storage_ix *uint, storage []byte) { var table_size uint = histograms_size * self.histogram_length_ if cap(self.depths_) < int(table_size) { self.depths_ = make([]byte, table_size) @@ -998,12 +1240,12 @@ func buildAndStoreEntropyCodesCommand(self *blockEncoder, histograms []histogram var i uint for i = 0; i < histograms_size; i++ { var ix uint = i * self.histogram_length_ - buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], bw) + buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], storage_ix, storage) } } } -func buildAndStoreEntropyCodesDistance(self *blockEncoder, histograms []histogramDistance, histograms_size uint, alphabet_size uint, tree []huffmanTree, bw *bitWriter) { +func buildAndStoreEntropyCodesDistance(self *blockEncoder, histograms []histogramDistance, histograms_size uint, alphabet_size uint, tree []huffmanTree, storage_ix *uint, storage []byte) { var table_size uint = histograms_size * self.histogram_length_ if cap(self.depths_) < int(table_size) { self.depths_ = make([]byte, table_size) @@ -1019,12 +1261,17 @@ func buildAndStoreEntropyCodesDistance(self *blockEncoder, histograms []histogra var i uint for i = 0; i 
< histograms_size; i++ { var ix uint = i * self.histogram_length_ - buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], bw) + buildAndStoreHuffmanTree(histograms[i].data_[0:], self.histogram_length_, alphabet_size, tree, self.depths_[ix:], self.bits_[ix:], storage_ix, storage) } } } -func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_byte byte, prev_byte2 byte, is_last bool, params *encoderParams, literal_context_mode int, commands []command, mb *metaBlockSplit, bw *bitWriter) { +func jumpToByteBoundary(storage_ix *uint, storage []byte) { + *storage_ix = (*storage_ix + 7) &^ 7 + storage[*storage_ix>>3] = 0 +} + +func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_byte byte, prev_byte2 byte, is_last bool, params *encoderParams, literal_context_mode int, commands []command, mb *metaBlockSplit, storage_ix *uint, storage []byte) { var pos uint = start_pos var i uint var num_distance_symbols uint32 = params.dist.alphabet_size @@ -1036,48 +1283,48 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b num_effective_distance_symbols = numHistogramDistanceSymbols } - storeCompressedMetaBlockHeader(is_last, length, bw) + storeCompressedMetaBlockHeader(is_last, length, storage_ix, storage) tree = make([]huffmanTree, maxHuffmanTreeSize) literal_enc := getBlockEncoder(numLiteralSymbols, mb.literal_split.num_types, mb.literal_split.types, mb.literal_split.lengths, mb.literal_split.num_blocks) command_enc := getBlockEncoder(numCommandSymbols, mb.command_split.num_types, mb.command_split.types, mb.command_split.lengths, mb.command_split.num_blocks) distance_enc := getBlockEncoder(uint(num_effective_distance_symbols), mb.distance_split.num_types, mb.distance_split.types, mb.distance_split.lengths, mb.distance_split.num_blocks) - buildAndStoreBlockSwitchEntropyCodes(literal_enc, tree, bw) - buildAndStoreBlockSwitchEntropyCodes(command_enc, tree, bw) - buildAndStoreBlockSwitchEntropyCodes(distance_enc, tree, bw) + buildAndStoreBlockSwitchEntropyCodes(literal_enc, tree, storage_ix, storage) + buildAndStoreBlockSwitchEntropyCodes(command_enc, tree, storage_ix, storage) + buildAndStoreBlockSwitchEntropyCodes(distance_enc, tree, storage_ix, storage) - bw.writeBits(2, uint64(dist.distance_postfix_bits)) - bw.writeBits(4, uint64(dist.num_direct_distance_codes)>>dist.distance_postfix_bits) + writeBits(2, uint64(dist.distance_postfix_bits), storage_ix, storage) + writeBits(4, uint64(dist.num_direct_distance_codes)>>dist.distance_postfix_bits, storage_ix, storage) for i = 0; i < mb.literal_split.num_types; i++ { - bw.writeBits(2, uint64(literal_context_mode)) + writeBits(2, uint64(literal_context_mode), storage_ix, storage) } if mb.literal_context_map_size == 0 { - storeTrivialContextMap(mb.literal_histograms_size, literalContextBits, tree, bw) + storeTrivialContextMap(mb.literal_histograms_size, literalContextBits, tree, storage_ix, storage) } else { - encodeContextMap(mb.literal_context_map, mb.literal_context_map_size, mb.literal_histograms_size, tree, bw) + encodeContextMap(mb.literal_context_map, mb.literal_context_map_size, mb.literal_histograms_size, tree, storage_ix, storage) } if mb.distance_context_map_size == 0 { - storeTrivialContextMap(mb.distance_histograms_size, distanceContextBits, tree, bw) + storeTrivialContextMap(mb.distance_histograms_size, distanceContextBits, tree, storage_ix, storage) } else { - 
encodeContextMap(mb.distance_context_map, mb.distance_context_map_size, mb.distance_histograms_size, tree, bw) + encodeContextMap(mb.distance_context_map, mb.distance_context_map_size, mb.distance_histograms_size, tree, storage_ix, storage) } - buildAndStoreEntropyCodesLiteral(literal_enc, mb.literal_histograms, mb.literal_histograms_size, numLiteralSymbols, tree, bw) - buildAndStoreEntropyCodesCommand(command_enc, mb.command_histograms, mb.command_histograms_size, numCommandSymbols, tree, bw) - buildAndStoreEntropyCodesDistance(distance_enc, mb.distance_histograms, mb.distance_histograms_size, uint(num_distance_symbols), tree, bw) + buildAndStoreEntropyCodesLiteral(literal_enc, mb.literal_histograms, mb.literal_histograms_size, numLiteralSymbols, tree, storage_ix, storage) + buildAndStoreEntropyCodesCommand(command_enc, mb.command_histograms, mb.command_histograms_size, numCommandSymbols, tree, storage_ix, storage) + buildAndStoreEntropyCodesDistance(distance_enc, mb.distance_histograms, mb.distance_histograms_size, uint(num_distance_symbols), tree, storage_ix, storage) tree = nil for _, cmd := range commands { var cmd_code uint = uint(cmd.cmd_prefix_) - storeSymbol(command_enc, cmd_code, bw) - storeCommandExtra(&cmd, bw) + storeSymbol(command_enc, cmd_code, storage_ix, storage) + storeCommandExtra(&cmd, storage_ix, storage) if mb.literal_context_map_size == 0 { var j uint for j = uint(cmd.insert_len_); j != 0; j-- { - storeSymbol(literal_enc, uint(input[pos&mask]), bw) + storeSymbol(literal_enc, uint(input[pos&mask]), storage_ix, storage) pos++ } } else { @@ -1085,7 +1332,7 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b for j = uint(cmd.insert_len_); j != 0; j-- { var context uint = uint(getContext(prev_byte, prev_byte2, literal_context_lut)) var literal byte = input[pos&mask] - storeSymbolWithContext(literal_enc, uint(literal), context, mb.literal_context_map, bw, literalContextBits) + storeSymbolWithContext(literal_enc, uint(literal), context, mb.literal_context_map, storage_ix, storage, literalContextBits) prev_byte2 = prev_byte prev_byte = literal pos++ @@ -1101,13 +1348,13 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b var distnumextra uint32 = uint32(cmd.dist_prefix_) >> 10 var distextra uint64 = uint64(cmd.dist_extra_) if mb.distance_context_map_size == 0 { - storeSymbol(distance_enc, dist_code, bw) + storeSymbol(distance_enc, dist_code, storage_ix, storage) } else { var context uint = uint(commandDistanceContext(&cmd)) - storeSymbolWithContext(distance_enc, dist_code, context, mb.distance_context_map, bw, distanceContextBits) + storeSymbolWithContext(distance_enc, dist_code, context, mb.distance_context_map, storage_ix, storage, distanceContextBits) } - bw.writeBits(uint(distnumextra), distextra) + writeBits(uint(distnumextra), distextra, storage_ix, storage) } } } @@ -1116,7 +1363,7 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b cleanupBlockEncoder(command_enc) cleanupBlockEncoder(literal_enc) if is_last { - bw.jumpToByteBoundary() + jumpToByteBoundary(storage_ix, storage) } } @@ -1137,16 +1384,16 @@ func buildHistograms(input []byte, start_pos uint, mask uint, commands []command } } -func storeDataWithHuffmanCodes(input []byte, start_pos uint, mask uint, commands []command, lit_depth []byte, lit_bits []uint16, cmd_depth []byte, cmd_bits []uint16, dist_depth []byte, dist_bits []uint16, bw *bitWriter) { +func storeDataWithHuffmanCodes(input []byte, start_pos uint, mask 
uint, commands []command, lit_depth []byte, lit_bits []uint16, cmd_depth []byte, cmd_bits []uint16, dist_depth []byte, dist_bits []uint16, storage_ix *uint, storage []byte) { var pos uint = start_pos for _, cmd := range commands { var cmd_code uint = uint(cmd.cmd_prefix_) var j uint - bw.writeBits(uint(cmd_depth[cmd_code]), uint64(cmd_bits[cmd_code])) - storeCommandExtra(&cmd, bw) + writeBits(uint(cmd_depth[cmd_code]), uint64(cmd_bits[cmd_code]), storage_ix, storage) + storeCommandExtra(&cmd, storage_ix, storage) for j = uint(cmd.insert_len_); j != 0; j-- { var literal byte = input[pos&mask] - bw.writeBits(uint(lit_depth[literal]), uint64(lit_bits[literal])) + writeBits(uint(lit_depth[literal]), uint64(lit_bits[literal]), storage_ix, storage) pos++ } @@ -1155,13 +1402,13 @@ func storeDataWithHuffmanCodes(input []byte, start_pos uint, mask uint, commands var dist_code uint = uint(cmd.dist_prefix_) & 0x3FF var distnumextra uint32 = uint32(cmd.dist_prefix_) >> 10 var distextra uint32 = cmd.dist_extra_ - bw.writeBits(uint(dist_depth[dist_code]), uint64(dist_bits[dist_code])) - bw.writeBits(uint(distnumextra), uint64(distextra)) + writeBits(uint(dist_depth[dist_code]), uint64(dist_bits[dist_code]), storage_ix, storage) + writeBits(uint(distnumextra), uint64(distextra), storage_ix, storage) } } } -func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, bw *bitWriter) { +func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, storage_ix *uint, storage []byte) { var lit_histo histogramLiteral var cmd_histo histogramCommand var dist_histo histogramDistance @@ -1174,7 +1421,7 @@ func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint, var tree []huffmanTree var num_distance_symbols uint32 = params.dist.alphabet_size - storeCompressedMetaBlockHeader(is_last, length, bw) + storeCompressedMetaBlockHeader(is_last, length, storage_ix, storage) histogramClearLiteral(&lit_histo) histogramClearCommand(&cmd_histo) @@ -1182,26 +1429,26 @@ func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint, buildHistograms(input, start_pos, mask, commands, &lit_histo, &cmd_histo, &dist_histo) - bw.writeBits(13, 0) + writeBits(13, 0, storage_ix, storage) tree = make([]huffmanTree, maxHuffmanTreeSize) - buildAndStoreHuffmanTree(lit_histo.data_[:], numLiteralSymbols, numLiteralSymbols, tree, lit_depth[:], lit_bits[:], bw) - buildAndStoreHuffmanTree(cmd_histo.data_[:], numCommandSymbols, numCommandSymbols, tree, cmd_depth[:], cmd_bits[:], bw) - buildAndStoreHuffmanTree(dist_histo.data_[:], maxSimpleDistanceAlphabetSize, uint(num_distance_symbols), tree, dist_depth[:], dist_bits[:], bw) + buildAndStoreHuffmanTree(lit_histo.data_[:], numLiteralSymbols, numLiteralSymbols, tree, lit_depth[:], lit_bits[:], storage_ix, storage) + buildAndStoreHuffmanTree(cmd_histo.data_[:], numCommandSymbols, numCommandSymbols, tree, cmd_depth[:], cmd_bits[:], storage_ix, storage) + buildAndStoreHuffmanTree(dist_histo.data_[:], maxSimpleDistanceAlphabetSize, uint(num_distance_symbols), tree, dist_depth[:], dist_bits[:], storage_ix, storage) tree = nil - storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], bw) + storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], 
storage_ix, storage) if is_last { - bw.jumpToByteBoundary() + jumpToByteBoundary(storage_ix, storage) } } -func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, bw *bitWriter) { +func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, storage_ix *uint, storage []byte) { var num_distance_symbols uint32 = params.dist.alphabet_size var distance_alphabet_bits uint32 = log2FloorNonZero(uint(num_distance_symbols-1)) + 1 - storeCompressedMetaBlockHeader(is_last, length, bw) + storeCompressedMetaBlockHeader(is_last, length, storage_ix, storage) - bw.writeBits(13, 0) + writeBits(13, 0, storage_ix, storage) if len(commands) <= 128 { var histogram = [numLiteralSymbols]uint32{0} @@ -1221,11 +1468,11 @@ func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is } buildAndStoreHuffmanTreeFast(histogram[:], num_literals, /* max_bits = */ - 8, lit_depth[:], lit_bits[:], bw) + 8, lit_depth[:], lit_bits[:], storage_ix, storage) - storeStaticCommandHuffmanTree(bw) - storeStaticDistanceHuffmanTree(bw) - storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], kStaticCommandCodeDepth[:], kStaticCommandCodeBits[:], kStaticDistanceCodeDepth[:], kStaticDistanceCodeBits[:], bw) + storeStaticCommandHuffmanTree(storage_ix, storage) + storeStaticDistanceHuffmanTree(storage_ix, storage) + storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], kStaticCommandCodeDepth[:], kStaticCommandCodeBits[:], kStaticDistanceCodeDepth[:], kStaticDistanceCodeBits[:], storage_ix, storage) } else { var lit_histo histogramLiteral var cmd_histo histogramCommand @@ -1241,43 +1488,52 @@ func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is histogramClearDistance(&dist_histo) buildHistograms(input, start_pos, mask, commands, &lit_histo, &cmd_histo, &dist_histo) buildAndStoreHuffmanTreeFast(lit_histo.data_[:], lit_histo.total_count_, /* max_bits = */ - 8, lit_depth[:], lit_bits[:], bw) + 8, lit_depth[:], lit_bits[:], storage_ix, storage) buildAndStoreHuffmanTreeFast(cmd_histo.data_[:], cmd_histo.total_count_, /* max_bits = */ - 10, cmd_depth[:], cmd_bits[:], bw) + 10, cmd_depth[:], cmd_bits[:], storage_ix, storage) buildAndStoreHuffmanTreeFast(dist_histo.data_[:], dist_histo.total_count_, /* max_bits = */ - uint(distance_alphabet_bits), dist_depth[:], dist_bits[:], bw) + uint(distance_alphabet_bits), dist_depth[:], dist_bits[:], storage_ix, storage) - storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], bw) + storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], storage_ix, storage) } if is_last { - bw.jumpToByteBoundary() + jumpToByteBoundary(storage_ix, storage) } } -/* This is for storing uncompressed blocks (simple raw storage of - bytes-as-bytes). */ -func storeUncompressedMetaBlock(is_final_block bool, input []byte, position uint, mask uint, len uint, bw *bitWriter) { +/* +This is for storing uncompressed blocks (simple raw storage of + + bytes-as-bytes). 
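+	The payload is copied byte-aligned immediately after the header.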
+*/ +func storeUncompressedMetaBlock(is_final_block bool, input []byte, position uint, mask uint, len uint, storage_ix *uint, storage []byte) { var masked_pos uint = position & mask - storeUncompressedMetaBlockHeader(uint(len), bw) - bw.jumpToByteBoundary() + storeUncompressedMetaBlockHeader(uint(len), storage_ix, storage) + jumpToByteBoundary(storage_ix, storage) if masked_pos+len > mask+1 { var len1 uint = mask + 1 - masked_pos - bw.writeBytes(input[masked_pos:][:len1]) + copy(storage[*storage_ix>>3:], input[masked_pos:][:len1]) + *storage_ix += len1 << 3 len -= len1 masked_pos = 0 } - bw.writeBytes(input[masked_pos:][:len]) + copy(storage[*storage_ix>>3:], input[masked_pos:][:len]) + *storage_ix += uint(len << 3) + + /* We need to clear the next 4 bytes to continue to be + compatible with BrotliWriteBits. */ + writeBitsPrepareStorage(*storage_ix, storage) /* Since the uncompressed block itself may not be the final block, add an empty one after this. */ if is_final_block { - bw.writeBits(1, 1) /* islast */ - bw.writeBits(1, 1) /* isempty */ - bw.jumpToByteBoundary() + writeBits(1, 1, storage_ix, storage) /* islast */ + writeBits(1, 1, storage_ix, storage) /* isempty */ + jumpToByteBoundary(storage_ix, storage) } } diff --git a/vendor/github.com/andybalholm/brotli/cluster_command.go b/vendor/github.com/andybalholm/brotli/cluster_command.go index 7449751b21..45b569bb2a 100644 --- a/vendor/github.com/andybalholm/brotli/cluster_command.go +++ b/vendor/github.com/andybalholm/brotli/cluster_command.go @@ -1,7 +1,5 @@ package brotli -import "math" - /* Copyright 2013 Google Inc. All Rights Reserved. Distributed under MIT license. @@ -164,163 +162,3 @@ func histogramBitCostDistanceCommand(histogram *histogramCommand, candidate *his return populationCostCommand(&tmp) - candidate.bit_cost_ } } - -/* Find the best 'out' histogram for each of the 'in' histograms. - When called, clusters[0..num_clusters) contains the unique values from - symbols[0..in_size), but this property is not preserved in this function. - Note: we assume that out[]->bit_cost_ is already up-to-date. */ -func histogramRemapCommand(in []histogramCommand, in_size uint, clusters []uint32, num_clusters uint, out []histogramCommand, symbols []uint32) { - var i uint - for i = 0; i < in_size; i++ { - var best_out uint32 - if i == 0 { - best_out = symbols[0] - } else { - best_out = symbols[i-1] - } - var best_bits float64 = histogramBitCostDistanceCommand(&in[i], &out[best_out]) - var j uint - for j = 0; j < num_clusters; j++ { - var cur_bits float64 = histogramBitCostDistanceCommand(&in[i], &out[clusters[j]]) - if cur_bits < best_bits { - best_bits = cur_bits - best_out = clusters[j] - } - } - - symbols[i] = best_out - } - - /* Recompute each out based on raw and symbols. */ - for i = 0; i < num_clusters; i++ { - histogramClearCommand(&out[clusters[i]]) - } - - for i = 0; i < in_size; i++ { - histogramAddHistogramCommand(&out[symbols[i]], &in[i]) - } -} - -/* Reorders elements of the out[0..length) array and changes values in - symbols[0..length) array in the following way: - * when called, symbols[] contains indexes into out[], and has N unique - values (possibly N < length) - * on return, symbols'[i] = f(symbols[i]) and - out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length, - where f is a bijection between the range of symbols[] and [0..N), and - the first occurrences of values in symbols'[i] come in consecutive - increasing order. - Returns N, the number of unique values in symbols[]. 
*/ - -var histogramReindexCommand_kInvalidIndex uint32 = math.MaxUint32 - -func histogramReindexCommand(out []histogramCommand, symbols []uint32, length uint) uint { - var new_index []uint32 = make([]uint32, length) - var next_index uint32 - var tmp []histogramCommand - var i uint - for i = 0; i < length; i++ { - new_index[i] = histogramReindexCommand_kInvalidIndex - } - - next_index = 0 - for i = 0; i < length; i++ { - if new_index[symbols[i]] == histogramReindexCommand_kInvalidIndex { - new_index[symbols[i]] = next_index - next_index++ - } - } - - /* TODO: by using idea of "cycle-sort" we can avoid allocation of - tmp and reduce the number of copying by the factor of 2. */ - tmp = make([]histogramCommand, next_index) - - next_index = 0 - for i = 0; i < length; i++ { - if new_index[symbols[i]] == next_index { - tmp[next_index] = out[symbols[i]] - next_index++ - } - - symbols[i] = new_index[symbols[i]] - } - - new_index = nil - for i = 0; uint32(i) < next_index; i++ { - out[i] = tmp[i] - } - - tmp = nil - return uint(next_index) -} - -func clusterHistogramsCommand(in []histogramCommand, in_size uint, max_histograms uint, out []histogramCommand, out_size *uint, histogram_symbols []uint32) { - var cluster_size []uint32 = make([]uint32, in_size) - var clusters []uint32 = make([]uint32, in_size) - var num_clusters uint = 0 - var max_input_histograms uint = 64 - var pairs_capacity uint = max_input_histograms * max_input_histograms / 2 - var pairs []histogramPair = make([]histogramPair, (pairs_capacity + 1)) - var i uint - - /* For the first pass of clustering, we allow all pairs. */ - for i = 0; i < in_size; i++ { - cluster_size[i] = 1 - } - - for i = 0; i < in_size; i++ { - out[i] = in[i] - out[i].bit_cost_ = populationCostCommand(&in[i]) - histogram_symbols[i] = uint32(i) - } - - for i = 0; i < in_size; i += max_input_histograms { - var num_to_combine uint = brotli_min_size_t(in_size-i, max_input_histograms) - var num_new_clusters uint - var j uint - for j = 0; j < num_to_combine; j++ { - clusters[num_clusters+j] = uint32(i + j) - } - - num_new_clusters = histogramCombineCommand(out, cluster_size, histogram_symbols[i:], clusters[num_clusters:], pairs, num_to_combine, num_to_combine, max_histograms, pairs_capacity) - num_clusters += num_new_clusters - } - { - /* For the second pass, we limit the total number of histogram pairs. - After this limit is reached, we only keep searching for the best pair. */ - var max_num_pairs uint = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters) - if pairs_capacity < (max_num_pairs + 1) { - var _new_size uint - if pairs_capacity == 0 { - _new_size = max_num_pairs + 1 - } else { - _new_size = pairs_capacity - } - var new_array []histogramPair - for _new_size < (max_num_pairs + 1) { - _new_size *= 2 - } - new_array = make([]histogramPair, _new_size) - if pairs_capacity != 0 { - copy(new_array, pairs[:pairs_capacity]) - } - - pairs = new_array - pairs_capacity = _new_size - } - - /* Collapse similar histograms. */ - num_clusters = histogramCombineCommand(out, cluster_size, histogram_symbols, clusters, pairs, num_clusters, in_size, max_histograms, max_num_pairs) - } - - pairs = nil - cluster_size = nil - - /* Find the optimal map from original histograms to the final ones. */ - histogramRemapCommand(in, in_size, clusters, num_clusters, out, histogram_symbols) - - clusters = nil - - /* Convert the context map to a canonical form. 
*/ - *out_size = histogramReindexCommand(out, histogram_symbols, in_size) -} diff --git a/vendor/github.com/andybalholm/brotli/compress_fragment.go b/vendor/github.com/andybalholm/brotli/compress_fragment.go index dbf0c43bf2..c9bd057705 100644 --- a/vendor/github.com/andybalholm/brotli/compress_fragment.go +++ b/vendor/github.com/andybalholm/brotli/compress_fragment.go @@ -45,7 +45,7 @@ func isMatch5(p1 []byte, p2 []byte) bool { and thus have to assign a non-zero depth for each literal. Returns estimated compression ratio millibytes/char for encoding given input with generated code. */ -func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte, bits []uint16, bw *bitWriter) uint { +func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte, bits []uint16, storage_ix *uint, storage []byte) uint { var histogram = [256]uint32{0} var histogram_total uint var i uint @@ -82,7 +82,7 @@ func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte } buildAndStoreHuffmanTreeFast(histogram[:], histogram_total, /* max_bits = */ - 8, depths, bits, bw) + 8, depths, bits, storage_ix, storage) { var literal_ratio uint = 0 for i = 0; i < 256; i++ { @@ -98,7 +98,7 @@ func buildAndStoreLiteralPrefixCode(input []byte, input_size uint, depths []byte /* Builds a command and distance prefix code (each 64 symbols) into "depth" and "bits" based on "histogram" and stores it into the bit stream. */ -func buildAndStoreCommandPrefixCode1(histogram []uint32, depth []byte, bits []uint16, bw *bitWriter) { +func buildAndStoreCommandPrefixCode1(histogram []uint32, depth []byte, bits []uint16, storage_ix *uint, storage []byte) { var tree [129]huffmanTree var cmd_depth = [numCommandSymbols]byte{0} /* Tree size for building a tree over 64 symbols is 2 * 64 + 1. 
*/ @@ -145,141 +145,141 @@ func buildAndStoreCommandPrefixCode1(histogram []uint32, depth []byte, bits []ui cmd_depth[448+8*i] = depth[56+i] } - storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], bw) + storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], storage_ix, storage) } - storeHuffmanTree(depth[64:], 64, tree[:], bw) + storeHuffmanTree(depth[64:], 64, tree[:], storage_ix, storage) } /* REQUIRES: insertlen < 6210 */ -func emitInsertLen1(insertlen uint, depth []byte, bits []uint16, histo []uint32, bw *bitWriter) { +func emitInsertLen1(insertlen uint, depth []byte, bits []uint16, histo []uint32, storage_ix *uint, storage []byte) { if insertlen < 6 { var code uint = insertlen + 40 - bw.writeBits(uint(depth[code]), uint64(bits[code])) + writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage) histo[code]++ } else if insertlen < 130 { var tail uint = insertlen - 2 var nbits uint32 = log2FloorNonZero(tail) - 1 var prefix uint = tail >> nbits var inscode uint = uint((nbits << 1) + uint32(prefix) + 42) - bw.writeBits(uint(depth[inscode]), uint64(bits[inscode])) - bw.writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<> nbits var code uint = uint((nbits << 1) + uint32(prefix) + 20) - bw.writeBits(uint(depth[code]), uint64(bits[code])) - bw.writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<> nbits var code uint = uint((nbits << 1) + uint32(prefix) + 4) - bw.writeBits(uint(depth[code]), uint64(bits[code])) - bw.writeBits(uint(nbits), uint64(tail)-(uint64(prefix)<> 5) + 30 - bw.writeBits(uint(depth[code]), uint64(bits[code])) - bw.writeBits(5, uint64(tail)&31) - bw.writeBits(uint(depth[64]), uint64(bits[64])) + writeBits(uint(depth[code]), uint64(bits[code]), storage_ix, storage) + writeBits(5, uint64(tail)&31, storage_ix, storage) + writeBits(uint(depth[64]), uint64(bits[64]), storage_ix, storage) histo[code]++ histo[64]++ } else if copylen < 2120 { var tail uint = copylen - 72 var nbits uint32 = log2FloorNonZero(tail) var code uint = uint(nbits + 28) - bw.writeBits(uint(depth[code]), uint64(bits[code])) - bw.writeBits(uint(nbits), uint64(tail)-(uint64(uint(1))<> nbits) & 1 var offset uint = (2 + prefix) << nbits var distcode uint = uint(2*(nbits-1) + uint32(prefix) + 80) - bw.writeBits(uint(depth[distcode]), uint64(bits[distcode])) - bw.writeBits(uint(nbits), uint64(d)-uint64(offset)) + writeBits(uint(depth[distcode]), uint64(bits[distcode]), storage_ix, storage) + writeBits(uint(nbits), uint64(d)-uint64(offset), storage_ix, storage) histo[distcode]++ } -func emitLiterals(input []byte, len uint, depth []byte, bits []uint16, bw *bitWriter) { +func emitLiterals(input []byte, len uint, depth []byte, bits []uint16, storage_ix *uint, storage []byte) { var j uint for j = 0; j < len; j++ { var lit byte = input[j] - bw.writeBits(uint(depth[lit]), uint64(bits[lit])) + writeBits(uint(depth[lit]), uint64(bits[lit]), storage_ix, storage) } } /* REQUIRES: len <= 1 << 24. 
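   MLEN occupies at most six nibbles (24 bits), so 1 << 24 is the
   largest encodable block length.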
*/ -func storeMetaBlockHeader1(len uint, is_uncompressed bool, bw *bitWriter) { +func storeMetaBlockHeader1(len uint, is_uncompressed bool, storage_ix *uint, storage []byte) { var nibbles uint = 6 /* ISLAST */ - bw.writeBits(1, 0) + writeBits(1, 0, storage_ix, storage) if len <= 1<<16 { nibbles = 4 @@ -287,11 +287,34 @@ func storeMetaBlockHeader1(len uint, is_uncompressed bool, bw *bitWriter) { nibbles = 5 } - bw.writeBits(2, uint64(nibbles)-4) - bw.writeBits(nibbles*4, uint64(len)-1) + writeBits(2, uint64(nibbles)-4, storage_ix, storage) + writeBits(nibbles*4, uint64(len)-1, storage_ix, storage) /* ISUNCOMPRESSED */ - bw.writeSingleBit(is_uncompressed) + writeSingleBit(is_uncompressed, storage_ix, storage) +} + +func updateBits(n_bits uint, bits uint32, pos uint, array []byte) { + for n_bits > 0 { + var byte_pos uint = pos >> 3 + var n_unchanged_bits uint = pos & 7 + var n_changed_bits uint = brotli_min_size_t(n_bits, 8-n_unchanged_bits) + var total_bits uint = n_unchanged_bits + n_changed_bits + var mask uint32 = (^((1 << total_bits) - 1)) | ((1 << n_unchanged_bits) - 1) + var unchanged_bits uint32 = uint32(array[byte_pos]) & mask + var changed_bits uint32 = bits & ((1 << n_changed_bits) - 1) + array[byte_pos] = byte(changed_bits<<n_unchanged_bits | unchanged_bits) + n_bits -= n_changed_bits + bits >>= n_changed_bits + pos += n_changed_bits + } +} + +func rewindBitPosition1(new_storage_ix uint, storage_ix *uint, storage []byte) { + var bitpos uint = new_storage_ix & 7 + var mask uint = (1 << bitpos) - 1 + storage[new_storage_ix>>3] &= byte(mask) + *storage_ix = new_storage_ix } var shouldMergeBlock_kSampleRate uint = 43 @@ -322,26 +345,151 @@ func shouldUseUncompressedMode(metablock_start []byte, next_emit []byte, insertl } } -func emitUncompressedMetaBlock1(data []byte, storage_ix_start uint, bw *bitWriter) { - bw.rewind(storage_ix_start) - storeMetaBlockHeader1(uint(len(data)), true, bw) - bw.jumpToByteBoundary() - bw.writeBytes(data) +func emitUncompressedMetaBlock1(begin []byte, end []byte, storage_ix_start uint, storage_ix *uint, storage []byte) { + var len uint = uint(-cap(end) + cap(begin)) + rewindBitPosition1(storage_ix_start, storage_ix, storage) + storeMetaBlockHeader1(uint(len), true, storage_ix, storage) + *storage_ix = (*storage_ix + 7) &^ 7 + copy(storage[*storage_ix>>3:], begin[:len]) + *storage_ix += uint(len << 3) + storage[*storage_ix>>3] = 0 } var kCmdHistoSeed = [128]uint32{ - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 0, 0, 0, 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, } var compressFragmentFastImpl_kFirstBlockSize uint = 3 << 15 var compressFragmentFastImpl_kMergeBlockSize uint = 1 << 16 -func
compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []int, table_bits uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, bw *bitWriter) { +func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table []int, table_bits uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, storage_ix *uint, storage []byte) { var cmd_histo [128]uint32 var ip_end int var next_emit int = 0 @@ -352,7 +500,7 @@ func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table [] var metablock_start int = input var block_size uint = brotli_min_size_t(input_size, compressFragmentFastImpl_kFirstBlockSize) var total_block_size uint = block_size - var mlen_storage_ix uint = bw.getPos() + 3 + var mlen_storage_ix uint = *storage_ix + 3 var lit_depth [256]byte var lit_bits [256]uint16 var literal_ratio uint @@ -369,21 +517,21 @@ func compressFragmentFastImpl(in []byte, input_size uint, is_last bool, table [] /* Save the bit position of the MLEN field of the meta-block header, so that we can update it later if we decide to extend this meta-block. */ - storeMetaBlockHeader1(block_size, false, bw) + storeMetaBlockHeader1(block_size, false, storage_ix, storage) /* No block splits, no contexts. */ - bw.writeBits(13, 0) + writeBits(13, 0, storage_ix, storage) - literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], bw) + literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], storage_ix, storage) { /* Store the pre-compressed command and distance prefix codes. */ var i uint for i = 0; i+7 < *cmd_code_numbits; i += 8 { - bw.writeBits(8, uint64(cmd_code[i>>3])) + writeBits(8, uint64(cmd_code[i>>3]), storage_ix, storage) } } - bw.writeBits(*cmd_code_numbits&7, uint64(cmd_code[*cmd_code_numbits>>3])) + writeBits(*cmd_code_numbits&7, uint64(cmd_code[*cmd_code_numbits>>3]), storage_ix, storage) /* Initialize the command and distance histograms. 
We will gather statistics of command and distance codes during the processing @@ -456,7 +604,7 @@ emit_commands: assert(candidate < ip) table[hash] = int(ip - base_ip) - if !(!isMatch5(in[ip:], in[candidate:])) { + if isMatch5(in[ip:], in[candidate:]) { break } } @@ -482,27 +630,27 @@ emit_commands: var insert uint = uint(base - next_emit) ip += int(matched) if insert < 6210 { - emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], bw) + emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage) } else if shouldUseUncompressedMode(in[metablock_start:], in[next_emit:], insert, literal_ratio) { - emitUncompressedMetaBlock1(in[metablock_start:base], mlen_storage_ix-3, bw) + emitUncompressedMetaBlock1(in[metablock_start:], in[base:], mlen_storage_ix-3, storage_ix, storage) input_size -= uint(base - input) input = base next_emit = input goto next_block } else { - emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], bw) + emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage) } - emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], bw) + emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], storage_ix, storage) if distance == last_distance { - bw.writeBits(uint(cmd_depth[64]), uint64(cmd_bits[64])) + writeBits(uint(cmd_depth[64]), uint64(cmd_bits[64]), storage_ix, storage) cmd_histo[64]++ } else { - emitDistance1(uint(distance), cmd_depth, cmd_bits, cmd_histo[:], bw) + emitDistance1(uint(distance), cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage) last_distance = distance } - emitCopyLenLastDistance1(matched, cmd_depth, cmd_bits, cmd_histo[:], bw) + emitCopyLenLastDistance1(matched, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage) next_emit = ip if ip >= ip_limit { @@ -538,8 +686,8 @@ emit_commands: } ip += int(matched) last_distance = int(base - candidate) /* > 0 */ - emitCopyLen1(matched, cmd_depth, cmd_bits, cmd_histo[:], bw) - emitDistance1(uint(last_distance), cmd_depth, cmd_bits, cmd_histo[:], bw) + emitCopyLen1(matched, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage) + emitDistance1(uint(last_distance), cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage) next_emit = ip if ip >= ip_limit { @@ -585,7 +733,7 @@ emit_remainder: nibbles. 
*/ total_block_size += block_size - bw.updateBits(20, uint32(total_block_size-1), mlen_storage_ix) + updateBits(20, uint32(total_block_size-1), mlen_storage_ix, storage) goto emit_commands } @@ -593,13 +741,13 @@ emit_remainder: if next_emit < ip_end { var insert uint = uint(ip_end - next_emit) if insert < 6210 { - emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], bw) - emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], bw) + emitInsertLen1(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage) + emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], storage_ix, storage) } else if shouldUseUncompressedMode(in[metablock_start:], in[next_emit:], insert, literal_ratio) { - emitUncompressedMetaBlock1(in[metablock_start:ip_end], mlen_storage_ix-3, bw) + emitUncompressedMetaBlock1(in[metablock_start:], in[ip_end:], mlen_storage_ix-3, storage_ix, storage) } else { - emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], bw) - emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], bw) + emitLongInsertLen(insert, cmd_depth, cmd_bits, cmd_histo[:], storage_ix, storage) + emitLiterals(in[next_emit:], insert, lit_depth[:], lit_bits[:], storage_ix, storage) } } @@ -615,29 +763,30 @@ next_block: /* Save the bit position of the MLEN field of the meta-block header, so that we can update it later if we decide to extend this meta-block. */ - mlen_storage_ix = bw.getPos() + 3 + mlen_storage_ix = *storage_ix + 3 - storeMetaBlockHeader1(block_size, false, bw) + storeMetaBlockHeader1(block_size, false, storage_ix, storage) /* No block splits, no contexts. */ - bw.writeBits(13, 0) + writeBits(13, 0, storage_ix, storage) - literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], bw) - buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, bw) + literal_ratio = buildAndStoreLiteralPrefixCode(in[input:], block_size, lit_depth[:], lit_bits[:], storage_ix, storage) + buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, storage_ix, storage) goto emit_commands } if !is_last { /* If this is not the last block, update the command and distance prefix codes for the next block and store the compressed forms. */ - var bw bitWriter - bw.dst = cmd_code - buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, &bw) - *cmd_code_numbits = bw.getPos() + cmd_code[0] = 0 + + *cmd_code_numbits = 0 + buildAndStoreCommandPrefixCode1(cmd_histo[:], cmd_depth, cmd_bits, cmd_code_numbits, cmd_code) } } -/* Compresses "input" string to bw as one or more complete meta-blocks. +/* Compresses "input" string to the "*storage" buffer as one or more complete + meta-blocks, and updates the "*storage_ix" bit position. If "is_last" is 1, emits an additional empty last meta-block. 
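The mlen_storage_ix bookkeeping above is the subtle part of this hunk: the fast path emits a meta-block header with a provisional MLEN, remembers the bit position of that field, and, when it decides to merge the next block, rewrites just those 20 bits with updateBits instead of re-emitting the header. A minimal, self-contained sketch of that in-place patching (names are ours; brotli_min_size_t is inlined):

package main

import "fmt"

// patchBits overwrites nBits bits of an LSB-first bit stream in place,
// starting at bit position pos, leaving all surrounding bits untouched.
// This mirrors the updateBits helper added in this diff.
func patchBits(nBits uint, bits uint32, pos uint, array []byte) {
	for nBits > 0 {
		bytePos := pos >> 3
		unchanged := pos & 7 // bits below pos in this byte are kept
		changed := 8 - unchanged
		if nBits < changed {
			changed = nBits
		}
		total := unchanged + changed
		mask := uint32(^((1 << total) - 1)) | uint32((1<<unchanged)-1)
		keep := uint32(array[bytePos]) & mask
		add := bits & uint32((1<<changed)-1)
		array[bytePos] = byte(add<<unchanged | keep)
		nBits -= changed
		bits >>= changed
		pos += changed
	}
}

func main() {
	// Bits 3..22 hold a 20-bit MLEN field that was written as zero;
	// later the block grows to 70000 bytes, so patch MLEN-1 in place,
	// like updateBits(20, uint32(total_block_size-1), mlen_storage_ix, storage).
	storage := make([]byte, 4)
	patchBits(20, 70000-1, 3, storage)
	fmt.Printf("%08b\n", storage)
}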
@@ -658,28 +807,28 @@ next_block: REQUIRES: "table_size" is an odd (9, 11, 13, 15) power of two OUTPUT: maximal copy distance <= |input_size| OUTPUT: maximal copy distance <= BROTLI_MAX_BACKWARD_LIMIT(18) */ -func compressFragmentFast(input []byte, input_size uint, is_last bool, table []int, table_size uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, bw *bitWriter) { - var initial_storage_ix uint = bw.getPos() +func compressFragmentFast(input []byte, input_size uint, is_last bool, table []int, table_size uint, cmd_depth []byte, cmd_bits []uint16, cmd_code_numbits *uint, cmd_code []byte, storage_ix *uint, storage []byte) { + var initial_storage_ix uint = *storage_ix var table_bits uint = uint(log2FloorNonZero(table_size)) if input_size == 0 { assert(is_last) - bw.writeBits(1, 1) /* islast */ - bw.writeBits(1, 1) /* isempty */ - bw.jumpToByteBoundary() + writeBits(1, 1, storage_ix, storage) /* islast */ + writeBits(1, 1, storage_ix, storage) /* isempty */ + *storage_ix = (*storage_ix + 7) &^ 7 return } - compressFragmentFastImpl(input, input_size, is_last, table, table_bits, cmd_depth, cmd_bits, cmd_code_numbits, cmd_code, bw) + compressFragmentFastImpl(input, input_size, is_last, table, table_bits, cmd_depth, cmd_bits, cmd_code_numbits, cmd_code, storage_ix, storage) /* If output is larger than single uncompressed block, rewrite it. */ - if bw.getPos()-initial_storage_ix > 31+(input_size<<3) { - emitUncompressedMetaBlock1(input[:input_size], initial_storage_ix, bw) + if *storage_ix-initial_storage_ix > 31+(input_size<<3) { + emitUncompressedMetaBlock1(input, input[input_size:], initial_storage_ix, storage_ix, storage) } if is_last { - bw.writeBits(1, 1) /* islast */ - bw.writeBits(1, 1) /* isempty */ - bw.jumpToByteBoundary() + writeBits(1, 1, storage_ix, storage) /* islast */ + writeBits(1, 1, storage_ix, storage) /* isempty */ + *storage_ix = (*storage_ix + 7) &^ 7 } } diff --git a/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go b/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go index 2473aca3fe..c5c663a556 100644 --- a/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go +++ b/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go @@ -39,9 +39,12 @@ func isMatch1(p1 []byte, p2 []byte, length uint) bool { return p1[4] == p2[4] && p1[5] == p2[5] } -/* Builds a command and distance prefix code (each 64 symbols) into "depth" and - "bits" based on "histogram" and stores it into the bit stream. */ -func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uint16, bw *bitWriter) { +/* +Builds a command and distance prefix code (each 64 symbols) into "depth" and + + "bits" based on "histogram" and stores it into the bit stream. +*/ +func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uint16, storage_ix *uint, storage []byte) { var tree [129]huffmanTree var cmd_depth = [numCommandSymbols]byte{0} /* Tree size for building a tree over 64 symbols is 2 * 64 + 1. 
*/ @@ -87,10 +90,10 @@ func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uin cmd_depth[448+8*i] = depth[16+i] } - storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], bw) + storeHuffmanTree(cmd_depth[:], numCommandSymbols, tree[:], storage_ix, storage) } - storeHuffmanTree(depth[64:], 64, tree[:], bw) + storeHuffmanTree(depth[64:], 64, tree[:], storage_ix, storage) } func emitInsertLen(insertlen uint32, commands *[]uint32) { @@ -197,7 +200,26 @@ func emitDistance(distance uint32, commands *[]uint32) { } /* REQUIRES: len <= 1 << 24. */ -func storeMetaBlockHeader(len uint, is_uncompressed bool, bw *bitWriter) { +func storeMetaBlockHeader(len uint, is_uncompressed bool, storage_ix *uint, storage []byte) { + var nibbles uint = 6 + + /* ISLAST */ + writeBits(1, 0, storage_ix, storage) + + if len <= 1<<16 { + nibbles = 4 + } else if len <= 1<<20 { + nibbles = 5 + } + + writeBits(2, uint64(nibbles)-4, storage_ix, storage) + writeBits(nibbles*4, uint64(len)-1, storage_ix, storage) + + /* ISUNCOMPRESSED */ + writeSingleBit(is_uncompressed, storage_ix, storage) +} + +func storeMetaBlockHeaderBW(len uint, is_uncompressed bool, bw *bitWriter) { var nibbles uint = 6 /* ISLAST */ @@ -207,6 +229,8 @@ func storeMetaBlockHeader(len uint, is_uncompressed bool, bw *bitWriter) { nibbles = 4 } else if len <= 1<<20 { nibbles = 5 + } else if len > 1<<24 { + panic("metablock too long") } bw.writeBits(2, uint64(nibbles)-4) @@ -440,20 +464,163 @@ emit_remainder: } var storeCommands_kNumExtraBits = [128]uint32{ - 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 12, 14, 24, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 24, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, - 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, - 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 2, + 2, + 3, + 3, + 4, + 4, + 5, + 5, + 6, + 7, + 8, + 9, + 10, + 12, + 14, + 24, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 2, + 2, + 3, + 3, + 4, + 4, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 2, + 2, + 3, + 3, + 4, + 4, + 5, + 5, + 6, + 7, + 8, + 9, + 10, + 24, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 2, + 2, + 3, + 3, + 4, + 4, + 5, + 5, + 6, + 6, + 7, + 7, + 8, + 8, + 9, + 9, + 10, + 10, + 11, + 11, + 12, + 12, + 13, + 13, + 14, + 14, + 15, + 15, + 16, + 16, + 17, + 17, + 18, + 18, + 19, + 19, + 20, + 20, + 21, + 21, + 22, + 22, + 23, + 23, + 24, + 24, } var storeCommands_kInsertOffset = [24]uint32{ - 0, 1, 2, 3, 4, 5, 6, 8, 10, 14, 18, 26, 34, 50, 66, 98, 130, 194, 322, 578, - 1090, 2114, 6210, 22594, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 8, + 10, + 14, + 18, + 26, + 34, + 50, + 66, + 98, + 130, + 194, + 322, + 578, + 1090, + 2114, + 6210, + 22594, } -func storeCommands(literals []byte, num_literals uint, commands []uint32, num_commands uint, bw *bitWriter) { +func storeCommands(literals []byte, num_literals uint, commands []uint32, num_commands uint, storage_ix *uint, storage []byte) { var lit_depths [256]byte var lit_bits [256]uint16 var lit_histo = [256]uint32{0} @@ -466,7 +633,7 @@ func storeCommands(literals []byte, num_literals uint, commands []uint32, num_co } buildAndStoreHuffmanTreeFast(lit_histo[:], num_literals, /* max_bits = */ - 8, lit_depths[:], lit_bits[:], bw) + 8, lit_depths[:], lit_bits[:], storage_ix, storage) for i = 
0; i < num_commands; i++ { var code uint32 = commands[i] & 0xFF @@ -478,21 +645,21 @@ func storeCommands(literals []byte, num_literals uint, commands []uint32, num_co cmd_histo[2] += 1 cmd_histo[64] += 1 cmd_histo[84] += 1 - buildAndStoreCommandPrefixCode(cmd_histo[:], cmd_depths[:], cmd_bits[:], bw) + buildAndStoreCommandPrefixCode(cmd_histo[:], cmd_depths[:], cmd_bits[:], storage_ix, storage) for i = 0; i < num_commands; i++ { var cmd uint32 = commands[i] var code uint32 = cmd & 0xFF var extra uint32 = cmd >> 8 assert(code < 128) - bw.writeBits(uint(cmd_depths[code]), uint64(cmd_bits[code])) - bw.writeBits(uint(storeCommands_kNumExtraBits[code]), uint64(extra)) + writeBits(uint(cmd_depths[code]), uint64(cmd_bits[code]), storage_ix, storage) + writeBits(uint(storeCommands_kNumExtraBits[code]), uint64(extra), storage_ix, storage) if code < 24 { var insert uint32 = storeCommands_kInsertOffset[code] + extra var j uint32 for j = 0; j < insert; j++ { var lit byte = literals[0] - bw.writeBits(uint(lit_depths[lit]), uint64(lit_bits[lit])) + writeBits(uint(lit_depths[lit]), uint64(lit_bits[lit]), storage_ix, storage) literals = literals[1:] } } @@ -520,13 +687,22 @@ func shouldCompress(input []byte, input_size uint, num_literals uint) bool { } } -func emitUncompressedMetaBlock(input []byte, input_size uint, bw *bitWriter) { - storeMetaBlockHeader(input_size, true, bw) - bw.jumpToByteBoundary() - bw.writeBytes(input[:input_size]) +func rewindBitPosition(new_storage_ix uint, storage_ix *uint, storage []byte) { + var bitpos uint = new_storage_ix & 7 + var mask uint = (1 << bitpos) - 1 + storage[new_storage_ix>>3] &= byte(mask) + *storage_ix = new_storage_ix +} + +func emitUncompressedMetaBlock(input []byte, input_size uint, storage_ix *uint, storage []byte) { + storeMetaBlockHeader(input_size, true, storage_ix, storage) + *storage_ix = (*storage_ix + 7) &^ 7 + copy(storage[*storage_ix>>3:], input[:input_size]) + *storage_ix += input_size << 3 + storage[*storage_ix>>3] = 0 } -func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_bits uint, min_match uint, bw *bitWriter) { +func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_bits uint, min_match uint, storage_ix *uint, storage []byte) { /* Save the start of the first block for position and distance computations. */ var base_ip []byte = input @@ -540,17 +716,17 @@ func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, co num_literals = uint(-cap(literals) + cap(literal_buf)) if shouldCompress(input, block_size, num_literals) { var num_commands uint = uint(-cap(commands) + cap(command_buf)) - storeMetaBlockHeader(block_size, false, bw) + storeMetaBlockHeader(block_size, false, storage_ix, storage) /* No block splits, no contexts. */ - bw.writeBits(13, 0) + writeBits(13, 0, storage_ix, storage) - storeCommands(literal_buf, num_literals, command_buf, num_commands, bw) + storeCommands(literal_buf, num_literals, command_buf, num_commands, storage_ix, storage) } else { /* Since we did not find many backward references and the entropy of the data is close to 8 bits, we can simply emit an uncompressed block. This makes compression speed of uncompressible data about 3x faster. 
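Both fast paths share the fallback used above: when entropy coding will not pay for itself, rewind and emit a stored (uncompressed) meta-block. The byte-alignment idiom they use is terse enough to misread, so here it is in isolation, a sketch with our own names (the trailing zero byte matches the vendored code and presumably gives the next writeBits a clean byte to OR into):

// appendStored byte-aligns the bit position, copies raw input bytes,
// advances the position past them, and clears the following byte.
func appendStored(input []byte, storageIx *uint, storage []byte) {
	*storageIx = (*storageIx + 7) &^ 7 // round up to a byte boundary
	copy(storage[*storageIx>>3:], input)
	*storageIx += uint(len(input)) << 3
	storage[*storageIx>>3] = 0
}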
*/ - emitUncompressedMetaBlock(input, block_size, bw) + emitUncompressedMetaBlock(input, block_size, storage_ix, storage) } input = input[block_size:] @@ -558,20 +734,24 @@ func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, co } } -/* Compresses "input" string to bw as one or more complete meta-blocks. +/* +Compresses "input" string to the "*storage" buffer as one or more complete - If "is_last" is 1, emits an additional empty last meta-block. + meta-blocks, and updates the "*storage_ix" bit position. - REQUIRES: "input_size" is greater than zero, or "is_last" is 1. - REQUIRES: "input_size" is less or equal to maximal metablock size (1 << 24). - REQUIRES: "command_buf" and "literal_buf" point to at least - kCompressFragmentTwoPassBlockSize long arrays. - REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero. - REQUIRES: "table_size" is a power of two - OUTPUT: maximal copy distance <= |input_size| - OUTPUT: maximal copy distance <= BROTLI_MAX_BACKWARD_LIMIT(18) */ -func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_size uint, bw *bitWriter) { - var initial_storage_ix uint = bw.getPos() + If "is_last" is 1, emits an additional empty last meta-block. + + REQUIRES: "input_size" is greater than zero, or "is_last" is 1. + REQUIRES: "input_size" is less or equal to maximal metablock size (1 << 24). + REQUIRES: "command_buf" and "literal_buf" point to at least + kCompressFragmentTwoPassBlockSize long arrays. + REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero. + REQUIRES: "table_size" is a power of two + OUTPUT: maximal copy distance <= |input_size| + OUTPUT: maximal copy distance <= BROTLI_MAX_BACKWARD_LIMIT(18) +*/ +func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_size uint, storage_ix *uint, storage []byte) { + var initial_storage_ix uint = *storage_ix var table_bits uint = uint(log2FloorNonZero(table_size)) var min_match uint if table_bits <= 15 { @@ -579,17 +759,17 @@ func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, comman } else { min_match = 6 } - compressFragmentTwoPassImpl(input, input_size, is_last, command_buf, literal_buf, table, table_bits, min_match, bw) + compressFragmentTwoPassImpl(input, input_size, is_last, command_buf, literal_buf, table, table_bits, min_match, storage_ix, storage) /* If output is larger than single uncompressed block, rewrite it. 
*/ - if bw.getPos()-initial_storage_ix > 31+(input_size<<3) { - bw.rewind(initial_storage_ix) - emitUncompressedMetaBlock(input, input_size, bw) + if *storage_ix-initial_storage_ix > 31+(input_size<<3) { + rewindBitPosition(initial_storage_ix, storage_ix, storage) + emitUncompressedMetaBlock(input, input_size, storage_ix, storage) } if is_last { - bw.writeBits(1, 1) /* islast */ - bw.writeBits(1, 1) /* isempty */ - bw.jumpToByteBoundary() + writeBits(1, 1, storage_ix, storage) /* islast */ + writeBits(1, 1, storage_ix, storage) /* isempty */ + *storage_ix = (*storage_ix + 7) &^ 7 } } diff --git a/vendor/github.com/andybalholm/brotli/decode.go b/vendor/github.com/andybalholm/brotli/decode.go index d2f39a051c..9d9513b7cf 100644 --- a/vendor/github.com/andybalholm/brotli/decode.go +++ b/vendor/github.com/andybalholm/brotli/decode.go @@ -50,21 +50,6 @@ const ( decoderErrorUnreachable = -31 ) -/** - * The value of the last error code, negative integer. - * - * All other error code values are in the range from ::lastErrorCode - * to @c -1. There are also 4 other possible non-error codes @c 0 .. @c 3 in - * ::BrotliDecoderErrorCode enumeration. - */ -const lastErrorCode = decoderErrorUnreachable - -/** Options to be used with ::BrotliDecoderSetParameter. */ -const ( - decoderParamDisableRingBufferReallocation = 0 - decoderParamLargeWindow = 1 -) - const huffmanTableBits = 8 const huffmanTableMask = 0xFF @@ -81,28 +66,6 @@ var kCodeLengthPrefixLength = [16]byte{2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2, 3, 2, 2, var kCodeLengthPrefixValue = [16]byte{0, 4, 3, 2, 0, 4, 3, 1, 0, 4, 3, 2, 0, 4, 3, 5} -func decoderSetParameter(state *Reader, p int, value uint32) bool { - if state.state != stateUninited { - return false - } - switch p { - case decoderParamDisableRingBufferReallocation: - if !(value == 0) { - state.canny_ringbuffer_allocation = 0 - } else { - state.canny_ringbuffer_allocation = 1 - } - return true - - case decoderParamLargeWindow: - state.large_window = (!(value == 0)) - return true - - default: - return false - } -} - /* Saves error code and converts it to BrotliDecoderResult. */ func saveErrorCode(s *Reader, e int) int { s.error_code = int(e) @@ -1125,10 +1088,8 @@ func decodeContextMap(context_map_size uint32, num_htrees *uint32, context_map_a Reads 3..54 bits. */ func decodeBlockTypeAndLength(safe int, s *Reader, tree_type int) bool { var max_block_type uint32 = s.num_block_types[tree_type] - var type_tree []huffmanCode - type_tree = s.block_type_trees[tree_type*huffmanMaxSize258:] - var len_tree []huffmanCode - len_tree = s.block_len_trees[tree_type*huffmanMaxSize26:] + type_tree := s.block_type_trees[tree_type*huffmanMaxSize258:] + len_tree := s.block_len_trees[tree_type*huffmanMaxSize26:] var br *bitReader = &s.br var ringbuffer []uint32 = s.block_type_rb[tree_type*2:] var block_type uint32 @@ -1280,8 +1241,7 @@ func unwrittenBytes(s *Reader, wrap bool) uint { Returns BROTLI_DECODER_NEEDS_MORE_OUTPUT only if there is more output to push and either ring-buffer is as big as window size, or |force| is true. 
*/ func writeRingBuffer(s *Reader, available_out *uint, next_out *[]byte, total_out *uint, force bool) int { - var start []byte - start = s.ringbuffer[s.partial_pos_out&uint(s.ringbuffer_mask):] + start := s.ringbuffer[s.partial_pos_out&uint(s.ringbuffer_mask):] var to_write uint = unwrittenBytes(s, true) var num_written uint = *available_out if num_written > to_write { @@ -1344,26 +1304,21 @@ func wrapRingBuffer(s *Reader) { Last two bytes of ring-buffer are initialized to 0, so context calculation could be done uniformly for the first two and all other positions. */ func ensureRingBuffer(s *Reader) bool { - var old_ringbuffer []byte = s.ringbuffer + var old_ringbuffer []byte if s.ringbuffer_size == s.new_ringbuffer_size { return true } - - s.ringbuffer = make([]byte, uint(s.new_ringbuffer_size)+uint(kRingBufferWriteAheadSlack)) - if s.ringbuffer == nil { - /* Restore previous value. */ - s.ringbuffer = old_ringbuffer - - return false + spaceNeeded := int(s.new_ringbuffer_size) + int(kRingBufferWriteAheadSlack) + if len(s.ringbuffer) < spaceNeeded { + old_ringbuffer = s.ringbuffer + s.ringbuffer = make([]byte, spaceNeeded) } s.ringbuffer[s.new_ringbuffer_size-2] = 0 s.ringbuffer[s.new_ringbuffer_size-1] = 0 - if !(old_ringbuffer == nil) { + if old_ringbuffer != nil { copy(s.ringbuffer, old_ringbuffer[:uint(s.pos)]) - - old_ringbuffer = nil } s.ringbuffer_size = s.new_ringbuffer_size @@ -1412,8 +1367,7 @@ func copyUncompressedBlockToOutput(available_out *uint, next_out *[]byte, total_ case stateUncompressedWrite: { - var result int - result = writeRingBuffer(s, available_out, next_out, total_out, false) + result := writeRingBuffer(s, available_out, next_out, total_out, false) if result != decoderSuccess { return result } @@ -1931,8 +1885,7 @@ CommandPostDecodeLiterals: } if transform_idx < int(trans.num_transforms) { - var word []byte - word = words.data[offset:] + word := words.data[offset:] var len int = i if transform_idx == int(trans.cutOffTransforms[0]) { copy(s.ringbuffer[pos:], word[:uint(len)]) @@ -1954,10 +1907,8 @@ CommandPostDecodeLiterals: } } else { var src_start int = (pos - s.distance_code) & s.ringbuffer_mask - var copy_dst []byte - copy_dst = s.ringbuffer[pos:] - var copy_src []byte - copy_src = s.ringbuffer[src_start:] + copy_dst := s.ringbuffer[pos:] + copy_src := s.ringbuffer[src_start:] var dst_end int = pos + i var src_end int = src_start + i @@ -2494,8 +2445,6 @@ func decoderDecompressStream(s *Reader, available_in *uint, next_in *[]byte, ava } else { s.state = stateCommandBegin } - - break } else if s.state == stateCommandPostWrite2 { s.state = stateCommandPostWrapCopy /* BROTLI_STATE_COMMAND_INNER_WRITE */ } else { diff --git a/vendor/github.com/andybalholm/brotli/encode.go b/vendor/github.com/andybalholm/brotli/encode.go index 3abaf571af..8e25a4ec78 100644 --- a/vendor/github.com/andybalholm/brotli/encode.go +++ b/vendor/github.com/andybalholm/brotli/encode.go @@ -87,9 +87,11 @@ type Writer struct { last_processed_pos_ uint64 dist_cache_ [numDistanceShortCodes]int saved_dist_cache_ [4]int + last_bytes_ uint16 + last_bytes_bits_ byte prev_byte_ byte prev_byte2_ byte - bw bitWriter + storage []byte small_table_ [1 << 10]int large_table_ []int large_table_size_ uint @@ -139,6 +141,14 @@ func wrapPosition(position uint64) uint32 { return result } +func (s *Writer) getStorage(size int) []byte { + if len(s.storage) < size { + s.storage = make([]byte, size) + } + + return s.storage +} + func hashTableSize(max_table_size uint, 
input_size uint) uint { var htsize uint = 256 for htsize < max_table_size && htsize < input_size { @@ -184,18 +194,23 @@ func getHashTable(s *Writer, quality int, input_size uint, table_size *uint) []i return table } -func encodeWindowBits(lgwin int, large_window bool, bw *bitWriter) { +func encodeWindowBits(lgwin int, large_window bool, last_bytes *uint16, last_bytes_bits *byte) { if large_window { - bw.writeBits(14, uint64((lgwin&0x3F)<<8|0x11)) + *last_bytes = uint16((lgwin&0x3F)<<8 | 0x11) + *last_bytes_bits = 14 } else { if lgwin == 16 { - bw.writeBits(1, 0) + *last_bytes = 0 + *last_bytes_bits = 1 } else if lgwin == 17 { - bw.writeBits(7, 1) + *last_bytes = 1 + *last_bytes_bits = 7 } else if lgwin > 17 { - bw.writeBits(4, uint64((lgwin-17)<<1|0x01)) + *last_bytes = uint16((lgwin-17)<<1 | 0x01) + *last_bytes_bits = 4 } else { - bw.writeBits(7, uint64((lgwin-8)<<4|0x01)) + *last_bytes = uint16((lgwin-8)<<4 | 0x01) + *last_bytes_bits = 7 } } } @@ -417,15 +432,18 @@ func chooseContextMode(params *encoderParams, data []byte, pos uint, mask uint, return contextUTF8 } -func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes uint, is_last bool, literal_context_mode int, params *encoderParams, prev_byte byte, prev_byte2 byte, num_literals uint, commands []command, saved_dist_cache []int, dist_cache []int, bw *bitWriter) { +func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes uint, is_last bool, literal_context_mode int, params *encoderParams, prev_byte byte, prev_byte2 byte, num_literals uint, commands []command, saved_dist_cache []int, dist_cache []int, storage_ix *uint, storage []byte) { var wrapped_last_flush_pos uint32 = wrapPosition(last_flush_pos) + var last_bytes uint16 + var last_bytes_bits byte var literal_context_lut contextLUT = getContextLUT(literal_context_mode) var block_params encoderParams = *params if bytes == 0 { /* Write the ISLAST and ISEMPTY bits. */ - bw.writeBits(2, 3) - bw.jumpToByteBoundary() + writeBits(2, 3, storage_ix, storage) + + *storage_ix = (*storage_ix + 7) &^ 7 return } @@ -434,15 +452,17 @@ func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes CreateBackwardReferences is now unused. 
*/ copy(dist_cache, saved_dist_cache[:4]) - storeUncompressedMetaBlock(is_last, data, uint(wrapped_last_flush_pos), mask, bytes, bw) + storeUncompressedMetaBlock(is_last, data, uint(wrapped_last_flush_pos), mask, bytes, storage_ix, storage) return } - savedPos := bw.getPos() + assert(*storage_ix <= 14) + last_bytes = uint16(storage[1])<<8 | uint16(storage[0]) + last_bytes_bits = byte(*storage_ix) if params.quality <= maxQualityForStaticEntropyCodes { - storeMetaBlockFast(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, bw) + storeMetaBlockFast(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, storage_ix, storage) } else if params.quality < minQualityForBlockSplit { - storeMetaBlockTrivial(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, bw) + storeMetaBlockTrivial(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, storage_ix, storage) } else { mb := getMetaBlockSplit() if params.quality < minQualityForHqBlockSplitting { @@ -469,15 +489,18 @@ func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes optimizeHistograms(num_effective_dist_codes, mb) } - storeMetaBlock(data, uint(wrapped_last_flush_pos), bytes, mask, prev_byte, prev_byte2, is_last, &block_params, literal_context_mode, commands, mb, bw) + storeMetaBlock(data, uint(wrapped_last_flush_pos), bytes, mask, prev_byte, prev_byte2, is_last, &block_params, literal_context_mode, commands, mb, storage_ix, storage) freeMetaBlockSplit(mb) } - if bytes+4 < bw.getPos()>>3 { + if bytes+4 < *storage_ix>>3 { /* Restore the distance cache and last byte. */ copy(dist_cache, saved_dist_cache[:4]) - bw.rewind(savedPos) - storeUncompressedMetaBlock(is_last, data, uint(wrapped_last_flush_pos), mask, bytes, bw) + + storage[0] = byte(last_bytes) + storage[1] = byte(last_bytes >> 8) + *storage_ix = uint(last_bytes_bits) + storeUncompressedMetaBlock(is_last, data, uint(wrapped_last_flush_pos), mask, bytes, storage_ix, storage) } } @@ -510,10 +533,8 @@ func ensureInitialized(s *Writer) bool { return true } - s.bw.bits = 0 - s.bw.nbits = 0 - s.bw.dst = s.bw.dst[:0] - + s.last_bytes_bits_ = 0 + s.last_bytes_ = 0 s.remaining_metadata_bytes_ = math.MaxUint32 sanitizeParams(&s.params) @@ -529,7 +550,7 @@ func ensureInitialized(s *Writer) bool { lgwin = brotli_max_int(lgwin, 18) } - encodeWindowBits(lgwin, s.params.large_window, &s.bw) + encodeWindowBits(lgwin, s.params.large_window, &s.last_bytes_, &s.last_bytes_bits_) } if s.params.quality == fastOnePassCompressionQuality { @@ -761,6 +782,8 @@ func encodeData(s *Writer, is_last bool, force_flush bool) bool { } if s.params.quality == fastOnePassCompressionQuality || s.params.quality == fastTwoPassCompressionQuality { + var storage []byte + var storage_ix uint = uint(s.last_bytes_bits_) var table_size uint var table []int @@ -770,16 +793,20 @@ func encodeData(s *Writer, is_last bool, force_flush bool) bool { return true } + storage = s.getStorage(int(2*bytes + 503)) + storage[0] = byte(s.last_bytes_) + storage[1] = byte(s.last_bytes_ >> 8) table = getHashTable(s, s.params.quality, uint(bytes), &table_size) if s.params.quality == fastOnePassCompressionQuality { - compressFragmentFast(data[wrapped_last_processed_pos&mask:], uint(bytes), is_last, table, table_size, s.cmd_depths_[:], s.cmd_bits_[:], &s.cmd_code_numbits_, s.cmd_code_[:], &s.bw) + compressFragmentFast(data[wrapped_last_processed_pos&mask:], uint(bytes), is_last, table, table_size, s.cmd_depths_[:], s.cmd_bits_[:], 
&s.cmd_code_numbits_, s.cmd_code_[:], &storage_ix, storage) } else { - compressFragmentTwoPass(data[wrapped_last_processed_pos&mask:], uint(bytes), is_last, s.command_buf_, s.literal_buf_, table, table_size, &s.bw) + compressFragmentTwoPass(data[wrapped_last_processed_pos&mask:], uint(bytes), is_last, s.command_buf_, s.literal_buf_, table, table_size, &storage_ix, storage) } + s.last_bytes_ = uint16(storage[storage_ix>>3]) + s.last_bytes_bits_ = byte(storage_ix & 7) updateLastProcessedPos(s) - s.writeOutput(s.bw.dst) - s.bw.dst = s.bw.dst[:0] + s.writeOutput(storage[:storage_ix>>3]) return true } { @@ -856,7 +883,13 @@ func encodeData(s *Writer, is_last bool, force_flush bool) bool { assert(s.input_pos_-s.last_flush_pos_ <= 1<<24) { var metablock_size uint32 = uint32(s.input_pos_ - s.last_flush_pos_) - writeMetaBlockInternal(data, uint(mask), s.last_flush_pos_, uint(metablock_size), is_last, literal_context_mode, &s.params, s.prev_byte_, s.prev_byte2_, s.num_literals_, s.commands, s.saved_dist_cache_[:], s.dist_cache_[:], &s.bw) + var storage []byte = s.getStorage(int(2*metablock_size + 503)) + var storage_ix uint = uint(s.last_bytes_bits_) + storage[0] = byte(s.last_bytes_) + storage[1] = byte(s.last_bytes_ >> 8) + writeMetaBlockInternal(data, uint(mask), s.last_flush_pos_, uint(metablock_size), is_last, literal_context_mode, &s.params, s.prev_byte_, s.prev_byte2_, s.num_literals_, s.commands, s.saved_dist_cache_[:], s.dist_cache_[:], &storage_ix, storage) + s.last_bytes_ = uint16(storage[storage_ix>>3]) + s.last_bytes_bits_ = byte(storage_ix & 7) s.last_flush_pos_ = s.input_pos_ if updateLastProcessedPos(s) { hasherReset(s.hasher_) @@ -877,22 +910,27 @@ func encodeData(s *Writer, is_last bool, force_flush bool) bool { emitting an uncompressed block. */ copy(s.saved_dist_cache_[:], s.dist_cache_[:]) - s.writeOutput(s.bw.dst) - s.bw.dst = s.bw.dst[:0] + s.writeOutput(storage[:storage_ix>>3]) return true } } -/* Dumps remaining output bits and metadata header to s.bw. +/* Dumps remaining output bits and metadata header to |header|. + Returns number of produced bytes. + REQUIRED: |header| should be 8-byte aligned and at least 16 bytes long. REQUIRED: |block_size| <= (1 << 24). 
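The writeMetadataHeader rewrite just below returns a byte count instead of streaming into the bitWriter; the header itself is the Brotli metadata escape: ISLAST=0, MNIBBLES=3 (0b11), a reserved 0 bit, then MSKIPBYTES and the length minus one. A sketch that assembles the same bit pattern LSB-first (metadataHeader is our name; bits.Len(x) stands in for log2FloorNonZero(x)+1, the same substitution this diff makes in fast_log.go):

import "math/bits"

// metadataHeader returns the header bits for a metadata block of
// blockSize bytes, packed LSB-first, plus the number of valid bits.
func metadataHeader(blockSize uint) (pattern uint64, nBits uint) {
	put := func(n uint, v uint64) {
		pattern |= v << nBits
		nBits += n
	}
	put(1, 0) // ISLAST
	put(2, 3) // MNIBBLES = 3: metadata escape
	put(1, 0) // reserved bit
	if blockSize == 0 {
		put(2, 0) // MSKIPBYTES = 0: empty metadata block
		return
	}
	nbytes := (uint(bits.Len(blockSize-1)) + 7) / 8
	put(2, uint64(nbytes))
	put(8*nbytes, uint64(blockSize)-1)
	return
}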
*/ -func writeMetadataHeader(s *Writer, block_size uint) { - bw := &s.bw - - bw.writeBits(1, 0) - bw.writeBits(2, 3) - bw.writeBits(1, 0) +func writeMetadataHeader(s *Writer, block_size uint, header []byte) uint { + storage_ix := uint(s.last_bytes_bits_) + header[0] = byte(s.last_bytes_) + header[1] = byte(s.last_bytes_ >> 8) + s.last_bytes_ = 0 + s.last_bytes_bits_ = 0 + + writeBits(1, 0, &storage_ix, header) + writeBits(2, 3, &storage_ix, header) + writeBits(1, 0, &storage_ix, header) if block_size == 0 { - bw.writeBits(2, 0) + writeBits(2, 0, &storage_ix, header) } else { var nbits uint32 if block_size == 1 { @@ -901,19 +939,34 @@ func writeMetadataHeader(s *Writer, block_size uint) { nbits = log2FloorNonZero(uint(uint32(block_size)-1)) + 1 } var nbytes uint32 = (nbits + 7) / 8 - bw.writeBits(2, uint64(nbytes)) - bw.writeBits(uint(8*nbytes), uint64(block_size)-1) + writeBits(2, uint64(nbytes), &storage_ix, header) + writeBits(uint(8*nbytes), uint64(block_size)-1, &storage_ix, header) } - bw.jumpToByteBoundary() + return (storage_ix + 7) >> 3 } func injectBytePaddingBlock(s *Writer) { + var seal uint32 = uint32(s.last_bytes_) + var seal_bits uint = uint(s.last_bytes_bits_) + s.last_bytes_ = 0 + s.last_bytes_bits_ = 0 + /* is_last = 0, data_nibbles = 11, reserved = 0, meta_nibbles = 00 */ - s.bw.writeBits(6, 0x6) - s.bw.jumpToByteBoundary() - s.writeOutput(s.bw.dst) - s.bw.dst = s.bw.dst[:0] + seal |= 0x6 << seal_bits + + seal_bits += 6 + + destination := s.tiny_buf_.u8[:] + + destination[0] = byte(seal) + if seal_bits > 8 { + destination[1] = byte(seal >> 8) + } + if seal_bits > 16 { + destination[2] = byte(seal >> 16) + } + s.writeOutput(destination[:(seal_bits+7)>>3]) } func checkFlushComplete(s *Writer) { @@ -945,7 +998,7 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[ } for { - if s.stream_state_ == streamFlushRequested && s.bw.nbits&7 != 0 { + if s.stream_state_ == streamFlushRequested && s.last_bytes_bits_ != 0 { injectBytePaddingBlock(s) continue } @@ -957,6 +1010,9 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[ var block_size uint = brotli_min_size_t(block_size_limit, *available_in) var is_last bool = (*available_in == block_size) && (op == int(operationFinish)) var force_flush bool = (*available_in == block_size) && (op == int(operationFlush)) + var max_out_size uint = 2*block_size + 503 + var storage []byte = nil + var storage_ix uint = uint(s.last_bytes_bits_) var table_size uint var table []int @@ -965,18 +1021,25 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[ continue } + storage = s.getStorage(int(max_out_size)) + + storage[0] = byte(s.last_bytes_) + storage[1] = byte(s.last_bytes_ >> 8) table = getHashTable(s, s.params.quality, block_size, &table_size) if s.params.quality == fastOnePassCompressionQuality { - compressFragmentFast(*next_in, block_size, is_last, table, table_size, s.cmd_depths_[:], s.cmd_bits_[:], &s.cmd_code_numbits_, s.cmd_code_[:], &s.bw) + compressFragmentFast(*next_in, block_size, is_last, table, table_size, s.cmd_depths_[:], s.cmd_bits_[:], &s.cmd_code_numbits_, s.cmd_code_[:], &storage_ix, storage) } else { - compressFragmentTwoPass(*next_in, block_size, is_last, command_buf, literal_buf, table, table_size, &s.bw) + compressFragmentTwoPass(*next_in, block_size, is_last, command_buf, literal_buf, table, table_size, &storage_ix, storage) } *next_in = (*next_in)[block_size:] *available_in -= block_size - s.writeOutput(s.bw.dst) - s.bw.dst = 
s.bw.dst[:0] + var out_bytes uint = storage_ix >> 3 + s.writeOutput(storage[:out_bytes]) + + s.last_bytes_ = uint16(storage[storage_ix>>3]) + s.last_bytes_bits_ = byte(storage_ix & 7) if force_flush { s.stream_state_ = streamFlushRequested @@ -1010,7 +1073,7 @@ func processMetadata(s *Writer, available_in *uint, next_in *[]byte) bool { } for { - if s.stream_state_ == streamFlushRequested && s.bw.nbits&7 != 0 { + if s.stream_state_ == streamFlushRequested && s.last_bytes_bits_ != 0 { injectBytePaddingBlock(s) continue } @@ -1024,9 +1087,8 @@ func processMetadata(s *Writer, available_in *uint, next_in *[]byte) bool { } if s.stream_state_ == streamMetadataHead { - writeMetadataHeader(s, uint(s.remaining_metadata_bytes_)) - s.writeOutput(s.bw.dst) - s.bw.dst = s.bw.dst[:0] + n := writeMetadataHeader(s, uint(s.remaining_metadata_bytes_), s.tiny_buf_.u8[:]) + s.writeOutput(s.tiny_buf_.u8[:n]) s.stream_state_ = streamMetadataBody continue } else { @@ -1112,7 +1174,7 @@ func encoderCompressStream(s *Writer, op int, available_in *uint, next_in *[]byt continue } - if s.stream_state_ == streamFlushRequested && s.bw.nbits&7 != 0 { + if s.stream_state_ == streamFlushRequested && s.last_bytes_bits_ != 0 { injectBytePaddingBlock(s) continue } diff --git a/vendor/github.com/andybalholm/brotli/encoder.go b/vendor/github.com/andybalholm/brotli/encoder.go new file mode 100644 index 0000000000..1928382596 --- /dev/null +++ b/vendor/github.com/andybalholm/brotli/encoder.go @@ -0,0 +1,177 @@ +package brotli + +import "github.com/andybalholm/brotli/matchfinder" + +// An Encoder implements the matchfinder.Encoder interface, writing in Brotli format. +type Encoder struct { + wroteHeader bool + bw bitWriter + distCache []distanceCode +} + +func (e *Encoder) Reset() { + e.wroteHeader = false + e.bw = bitWriter{} +} + +func (e *Encoder) Encode(dst []byte, src []byte, matches []matchfinder.Match, lastBlock bool) []byte { + e.bw.dst = dst + if !e.wroteHeader { + e.bw.writeBits(4, 15) + e.wroteHeader = true + } + + if len(src) == 0 { + if lastBlock { + e.bw.writeBits(2, 3) // islast + isempty + e.bw.jumpToByteBoundary() + return e.bw.dst + } + return dst + } + + var literalHisto [256]uint32 + var commandHisto [704]uint32 + var distanceHisto [64]uint32 + literalCount := 0 + commandCount := 0 + distanceCount := 0 + + if len(e.distCache) < len(matches) { + e.distCache = make([]distanceCode, len(matches)) + } + + // first pass: build the histograms + pos := 0 + + // d is the ring buffer of the last 4 distances. + d := [4]int{-10, -10, -10, -10} + for i, m := range matches { + if m.Unmatched > 0 { + for _, c := range src[pos : pos+m.Unmatched] { + literalHisto[c]++ + } + literalCount += m.Unmatched + } + + insertCode := getInsertLengthCode(uint(m.Unmatched)) + copyCode := getCopyLengthCode(uint(m.Length)) + if m.Length == 0 { + // If the stream ends with unmatched bytes, we need a dummy copy length. 
+ copyCode = 2 + } + command := combineLengthCodes(insertCode, copyCode, false) + commandHisto[command]++ + commandCount++ + + if command >= 128 && m.Length != 0 { + var distCode distanceCode + switch m.Distance { + case d[3]: + distCode.code = 0 + case d[2]: + distCode.code = 1 + case d[1]: + distCode.code = 2 + case d[0]: + distCode.code = 3 + case d[3] - 1: + distCode.code = 4 + case d[3] + 1: + distCode.code = 5 + case d[3] - 2: + distCode.code = 6 + case d[3] + 2: + distCode.code = 7 + case d[3] - 3: + distCode.code = 8 + case d[3] + 3: + distCode.code = 9 + + // In my testing, codes 10–15 actually reduced the compression ratio. + + default: + distCode = getDistanceCode(m.Distance) + } + e.distCache[i] = distCode + distanceHisto[distCode.code]++ + distanceCount++ + if distCode.code != 0 { + d[0], d[1], d[2], d[3] = d[1], d[2], d[3], m.Distance + } + } + + pos += m.Unmatched + m.Length + } + + storeMetaBlockHeaderBW(uint(len(src)), false, &e.bw) + e.bw.writeBits(13, 0) + + var literalDepths [256]byte + var literalBits [256]uint16 + buildAndStoreHuffmanTreeFastBW(literalHisto[:], uint(literalCount), 8, literalDepths[:], literalBits[:], &e.bw) + + var commandDepths [704]byte + var commandBits [704]uint16 + buildAndStoreHuffmanTreeFastBW(commandHisto[:], uint(commandCount), 10, commandDepths[:], commandBits[:], &e.bw) + + var distanceDepths [64]byte + var distanceBits [64]uint16 + buildAndStoreHuffmanTreeFastBW(distanceHisto[:], uint(distanceCount), 6, distanceDepths[:], distanceBits[:], &e.bw) + + pos = 0 + for i, m := range matches { + insertCode := getInsertLengthCode(uint(m.Unmatched)) + copyCode := getCopyLengthCode(uint(m.Length)) + if m.Length == 0 { + // If the stream ends with unmatched bytes, we need a dummy copy length. + copyCode = 2 + } + command := combineLengthCodes(insertCode, copyCode, false) + e.bw.writeBits(uint(commandDepths[command]), uint64(commandBits[command])) + if kInsExtra[insertCode] > 0 { + e.bw.writeBits(uint(kInsExtra[insertCode]), uint64(m.Unmatched)-uint64(kInsBase[insertCode])) + } + if kCopyExtra[copyCode] > 0 { + e.bw.writeBits(uint(kCopyExtra[copyCode]), uint64(m.Length)-uint64(kCopyBase[copyCode])) + } + + if m.Unmatched > 0 { + for _, c := range src[pos : pos+m.Unmatched] { + e.bw.writeBits(uint(literalDepths[c]), uint64(literalBits[c])) + } + } + + if command >= 128 && m.Length != 0 { + distCode := e.distCache[i] + e.bw.writeBits(uint(distanceDepths[distCode.code]), uint64(distanceBits[distCode.code])) + if distCode.nExtra > 0 { + e.bw.writeBits(distCode.nExtra, distCode.extraBits) + } + } + + pos += m.Unmatched + m.Length + } + + if lastBlock { + e.bw.writeBits(2, 3) // islast + isempty + e.bw.jumpToByteBoundary() + } + return e.bw.dst +} + +type distanceCode struct { + code int + nExtra uint + extraBits uint64 +} + +func getDistanceCode(distance int) distanceCode { + d := distance + 3 + nbits := log2FloorNonZero(uint(d)) - 1 + prefix := (d >> nbits) & 1 + offset := (2 + prefix) << nbits + distcode := int(2*(nbits-1)) + prefix + 16 + extra := d - offset + return distanceCode{distcode, uint(nbits), uint64(extra)} +} diff --git a/vendor/github.com/andybalholm/brotli/entropy_encode_static.go b/vendor/github.com/andybalholm/brotli/entropy_encode_static.go index 2543f8f07d..294aff4f4e 100644 --- a/vendor/github.com/andybalholm/brotli/entropy_encode_static.go +++ b/vendor/github.com/andybalholm/brotli/entropy_encode_static.go @@ -778,7 +778,11 @@ var kStaticDistanceCodeDepth = [64]byte{ var kCodeLengthBits = 
[18]uint32{0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 15, 31, 0, 11, 7} -func storeStaticCodeLengthCode(bw *bitWriter) { +func storeStaticCodeLengthCode(storage_ix *uint, storage []byte) { + writeBits(40, 0x0000FF55555554, storage_ix, storage) +} + +func storeStaticCodeLengthCodeBW(bw *bitWriter) { bw.writeBits(32, 0x55555554) bw.writeBits(8, 0xFF) } @@ -4318,10 +4322,9 @@ var kStaticCommandCodeBits = [numCommandSymbols]uint16{ 2047, } -func storeStaticCommandHuffmanTree(bw *bitWriter) { - bw.writeBits(32, 0x16307003) - bw.writeBits(24, 0x926244) - bw.writeBits(3, 0x00000000) +func storeStaticCommandHuffmanTree(storage_ix *uint, storage []byte) { + writeBits(56, 0x92624416307003, storage_ix, storage) + writeBits(3, 0x00000000, storage_ix, storage) } var kStaticDistanceCodeBits = [64]uint16{ @@ -4391,6 +4394,6 @@ var kStaticDistanceCodeBits = [64]uint16{ 63, } -func storeStaticDistanceHuffmanTree(bw *bitWriter) { - bw.writeBits(28, 0x0369DC03) +func storeStaticDistanceHuffmanTree(storage_ix *uint, storage []byte) { + writeBits(28, 0x0369DC03, storage_ix, storage) } diff --git a/vendor/github.com/andybalholm/brotli/fast_log.go b/vendor/github.com/andybalholm/brotli/fast_log.go index bbae3009be..9d6607f7e2 100644 --- a/vendor/github.com/andybalholm/brotli/fast_log.go +++ b/vendor/github.com/andybalholm/brotli/fast_log.go @@ -1,6 +1,9 @@ package brotli -import "math" +import ( + "math" + "math/bits" +) /* Copyright 2013 Google Inc. All Rights Reserved. @@ -11,16 +14,7 @@ import "math" /* Utilities for fast computation of logarithms. */ func log2FloorNonZero(n uint) uint32 { - /* TODO: generalize and move to platform.h */ - var result uint32 = 0 - for { - n >>= 1 - if n == 0 { - break - } - result++ - } - return result + return uint32(bits.Len(n)) - 1 } /* A lookup table for small values of log2(int) to be used in entropy diff --git a/vendor/github.com/andybalholm/brotli/hash.go b/vendor/github.com/andybalholm/brotli/hash.go index 003b433ea6..00f812e87e 100644 --- a/vendor/github.com/andybalholm/brotli/hash.go +++ b/vendor/github.com/andybalholm/brotli/hash.go @@ -29,8 +29,6 @@ type hasherHandle interface { Store(data []byte, mask uint, ix uint) } -type score_t uint - const kCutoffTransformsCount uint32 = 10 /* 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 */ diff --git a/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go b/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go index 3364c44bd5..306e46d3db 100644 --- a/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go +++ b/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go @@ -110,8 +110,7 @@ func (h *hashForgetfulChain) Prepare(one_shot bool, input_size uint, data []byte func (h *hashForgetfulChain) Store(data []byte, mask uint, ix uint) { var key uint = h.HashBytes(data[ix&mask:]) var bank uint = key & (h.numBanks - 1) - var idx uint - idx = uint(h.free_slot_idx[bank]) & ((1 << h.bankBits) - 1) + idx := uint(h.free_slot_idx[bank]) & ((1 << h.bankBits) - 1) h.free_slot_idx[bank]++ var delta uint = ix - uint(h.addr[key]) h.tiny_hash[uint16(ix)] = byte(key) diff --git a/vendor/github.com/andybalholm/brotli/hash_rolling.go b/vendor/github.com/andybalholm/brotli/hash_rolling.go index ad655a0a5b..6630fc07e4 100644 --- a/vendor/github.com/andybalholm/brotli/hash_rolling.go +++ b/vendor/github.com/andybalholm/brotli/hash_rolling.go @@ -48,7 +48,6 @@ type hashRolling struct { 
state uint32 table []uint32 next_ix uint - chunk_len uint32 factor uint32 factor_remove uint32 } diff --git a/vendor/github.com/andybalholm/brotli/http.go b/vendor/github.com/andybalholm/brotli/http.go index af58670f2c..3d3a8a06fd 100644 --- a/vendor/github.com/andybalholm/brotli/http.go +++ b/vendor/github.com/andybalholm/brotli/http.go @@ -11,15 +11,7 @@ import ( // the Accept-Encoding header, sets the Content-Encoding header, and returns a // WriteCloser that implements that compression. The Close method must be called // before the current HTTP handler returns. -// -// Due to https://github.com/golang/go/issues/31753, the response will not be -// compressed unless you set a Content-Type header before you call -// HTTPCompressor. func HTTPCompressor(w http.ResponseWriter, r *http.Request) io.WriteCloser { - if w.Header().Get("Content-Type") == "" { - return nopCloser{w} - } - if w.Header().Get("Vary") == "" { w.Header().Set("Vary", "Accept-Encoding") } @@ -28,7 +20,7 @@ func HTTPCompressor(w http.ResponseWriter, r *http.Request) io.WriteCloser { switch encoding { case "br": w.Header().Set("Content-Encoding", "br") - return NewWriter(w) + return NewWriterV2(w, DefaultCompression) case "gzip": w.Header().Set("Content-Encoding", "gzip") return gzip.NewWriter(w) @@ -180,8 +172,8 @@ func init() { var t octetType isCtl := c <= 31 || c == 127 isChar := 0 <= c && c <= 127 - isSeparator := strings.IndexRune(" \t\"(),/:;<=>?@[]\\{}", rune(c)) >= 0 - if strings.IndexRune(" \t\r\n", rune(c)) >= 0 { + isSeparator := strings.ContainsRune(" \t\"(),/:;<=>?@[]\\{}", rune(c)) + if strings.ContainsRune(" \t\r\n", rune(c)) { t |= isSpace } if isChar && !isCtl && !isSeparator { diff --git a/vendor/github.com/andybalholm/brotli/matchfinder/emitter.go b/vendor/github.com/andybalholm/brotli/matchfinder/emitter.go new file mode 100644 index 0000000000..507d1cae64 --- /dev/null +++ b/vendor/github.com/andybalholm/brotli/matchfinder/emitter.go @@ -0,0 +1,34 @@ +package matchfinder + +// An absoluteMatch is like a Match, but it stores indexes into the byte +// stream instead of lengths. +type absoluteMatch struct { + // Start is the index of the first byte. + Start int + + // End is the index of the byte after the last byte + // (so that End - Start = Length). + End int + + // Match is the index of the previous data that matches + // (Start - Match = Distance). + Match int +} + +// A matchEmitter manages the output of matches for a MatchFinder. +type matchEmitter struct { + // Dst is the destination slice that Matches are added to. + Dst []Match + + // NextEmit is the index of the next byte to emit. + NextEmit int +} + +func (e *matchEmitter) emit(m absoluteMatch) { + e.Dst = append(e.Dst, Match{ + Unmatched: m.Start - e.NextEmit, + Length: m.End - m.Start, + Distance: m.Start - m.Match, + }) + e.NextEmit = m.End +} diff --git a/vendor/github.com/andybalholm/brotli/matchfinder/m0.go b/vendor/github.com/andybalholm/brotli/matchfinder/m0.go new file mode 100644 index 0000000000..773b7c49f3 --- /dev/null +++ b/vendor/github.com/andybalholm/brotli/matchfinder/m0.go @@ -0,0 +1,169 @@ +package matchfinder + +import ( + "encoding/binary" +) + +// M0 is an implementation of the MatchFinder interface based +// on the algorithm used by snappy, but modified to be more like the algorithm +// used by compression level 0 of the brotli reference implementation. +// +// It has a maximum block size of 65536 bytes. 
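The new matchfinder package splits compression into two pluggable halves: a MatchFinder turns a block into (Unmatched, Length, Distance) triples, and the Encoder added in encoder.go serializes those triples as a Brotli meta-block. A sketch of how they compose, using only the signatures introduced in this diff; driving them by hand like this is our illustration, not the package's documented entry point:

import (
	"github.com/andybalholm/brotli"
	"github.com/andybalholm/brotli/matchfinder"
)

// compressBlock compresses one block with the M0 finder and the new
// Encoder. M0 accepts at most 65536 bytes per FindMatches call.
func compressBlock(src []byte) []byte {
	var mf matchfinder.M0
	var enc brotli.Encoder
	matches := mf.FindMatches(nil, src)
	// lastBlock=true makes Encode append the empty ISLAST meta-block.
	return enc.Encode(nil, src, matches, true)
}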
+type M0 struct { + // Lazy turns on "lazy matching," for higher compression but less speed. + Lazy bool + + MaxDistance int + MaxLength int +} + +func (M0) Reset() {} + +const ( + m0HashLen = 5 + + m0TableBits = 14 + m0TableSize = 1 << m0TableBits + m0Shift = 32 - m0TableBits + // m0TableMask is redundant, but helps the compiler eliminate bounds + // checks. + m0TableMask = m0TableSize - 1 +) + +func (m M0) hash(data uint64) uint64 { + hash := (data << (64 - 8*m0HashLen)) * hashMul64 + return hash >> (64 - m0TableBits) +} + +// FindMatches looks for matches in src, appends them to dst, and returns dst. +// src must not be longer than 65536 bytes. +func (m M0) FindMatches(dst []Match, src []byte) []Match { + const inputMargin = 16 - 1 + const minNonLiteralBlockSize = 1 + 1 + inputMargin + + if len(src) < minNonLiteralBlockSize { + dst = append(dst, Match{ + Unmatched: len(src), + }) + return dst + } + if len(src) > 65536 { + panic("block too long") + } + + var table [m0TableSize]uint16 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + nextHash := m.hash(binary.LittleEndian.Uint64(src[s:])) + + for { + // Copied from the C++ snappy implementation: + // + // Heuristic match skipping: If 32 bytes are scanned with no matches + // found, start looking only at every other byte. If 32 more bytes are + // scanned (or skipped), look at every third byte, etc.. When a match + // is found, immediately go back to looking at every byte. This is a + // small loss (~5% performance, ~0.1% density) for compressible data + // due to more bookkeeping, but for non-compressible data (such as + // JPEG) it's a huge win since the compressor quickly "realizes" the + // data is incompressible and doesn't bother looking for matches + // everywhere. + // + // The "skip" variable keeps track of how many bytes there are since + // the last match; dividing it by 32 (ie. right-shifting by five) gives + // the number of bytes to move ahead for each iteration. + skip := 32 + + nextS := s + candidate := 0 + for { + s = nextS + bytesBetweenHashLookups := skip >> 5 + nextS = s + bytesBetweenHashLookups + skip += bytesBetweenHashLookups + if nextS > sLimit { + goto emitRemainder + } + candidate = int(table[nextHash&m0TableMask]) + table[nextHash&m0TableMask] = uint16(s) + nextHash = m.hash(binary.LittleEndian.Uint64(src[nextS:])) + if m.MaxDistance != 0 && s-candidate > m.MaxDistance { + continue + } + if binary.LittleEndian.Uint32(src[s:]) == binary.LittleEndian.Uint32(src[candidate:]) { + break + } + } + + // Invariant: we have a 4-byte match at s. 
+ base := s + s = extendMatch(src, candidate+4, s+4) + + origBase := base + if m.Lazy && base+1 < sLimit { + newBase := base + 1 + h := m.hash(binary.LittleEndian.Uint64(src[newBase:])) + newCandidate := int(table[h&m0TableMask]) + table[h&m0TableMask] = uint16(newBase) + okDistance := true + if m.MaxDistance != 0 && newBase-newCandidate > m.MaxDistance { + okDistance = false + } + if okDistance && binary.LittleEndian.Uint32(src[newBase:]) == binary.LittleEndian.Uint32(src[newCandidate:]) { + newS := extendMatch(src, newCandidate+4, newBase+4) + if newS-newBase > s-base+1 { + s = newS + base = newBase + candidate = newCandidate + } + } + } + + if m.MaxLength != 0 && s-base > m.MaxLength { + s = base + m.MaxLength + } + dst = append(dst, Match{ + Unmatched: base - nextEmit, + Length: s - base, + Distance: base - candidate, + }) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if m.Lazy { + // If lazy matching is enabled, we update the hash table for + // every byte in the match. + for i := origBase + 2; i < s-1; i++ { + x := binary.LittleEndian.Uint64(src[i:]) + table[m.hash(x)&m0TableMask] = uint16(i) + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := binary.LittleEndian.Uint64(src[s-1:]) + prevHash := m.hash(x >> 0) + table[prevHash&m0TableMask] = uint16(s - 1) + nextHash = m.hash(x >> 8) + } + +emitRemainder: + if nextEmit < len(src) { + dst = append(dst, Match{ + Unmatched: len(src) - nextEmit, + }) + } + return dst +} diff --git a/vendor/github.com/andybalholm/brotli/matchfinder/m4.go b/vendor/github.com/andybalholm/brotli/matchfinder/m4.go new file mode 100644 index 0000000000..818947255d --- /dev/null +++ b/vendor/github.com/andybalholm/brotli/matchfinder/m4.go @@ -0,0 +1,308 @@ +package matchfinder + +import ( + "encoding/binary" + "math/bits" + "runtime" +) + +// M4 is an implementation of the MatchFinder +// interface that uses a hash table to find matches, +// optional match chains, +// and the advanced parsing technique from +// https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html. +type M4 struct { + // MaxDistance is the maximum distance (in bytes) to look back for + // a match. The default is 65535. + MaxDistance int + + // MinLength is the length of the shortest match to return. + // The default is 4. + MinLength int + + // HashLen is the number of bytes to use to calculate the hashes. + // The maximum is 8 and the default is 6. + HashLen int + + // TableBits is the number of bits in the hash table indexes. + // The default is 17 (128K entries). + TableBits int + + // ChainLength is how many entries to search on the "match chain" of older + // locations with the same hash as the current location. + ChainLength int + + // DistanceBitCost is used when comparing two matches to see + // which is better. The comparison is primarily based on the length + // of the matches, but it can also take the distance into account, + // in terms of the number of bits needed to represent the distance. + // One byte of length is given a score of 256, so 32 (256/8) would + // be a reasonable first guess for the value of one bit. + // (The default is 0, which bases the comparison solely on length.) 
+	DistanceBitCost int
+
+	table []uint32
+	chain []uint16
+
+	history []byte
+}
+
+func (q *M4) Reset() {
+	for i := range q.table {
+		q.table[i] = 0
+	}
+	q.history = q.history[:0]
+	q.chain = q.chain[:0]
+}
+
+func (q *M4) score(m absoluteMatch) int {
+	return (m.End-m.Start)*256 + (bits.LeadingZeros32(uint32(m.Start-m.Match))-32)*q.DistanceBitCost
+}
+
+func (q *M4) FindMatches(dst []Match, src []byte) []Match {
+	if q.MaxDistance == 0 {
+		q.MaxDistance = 65535
+	}
+	if q.MinLength == 0 {
+		q.MinLength = 4
+	}
+	if q.HashLen == 0 {
+		q.HashLen = 6
+	}
+	if q.TableBits == 0 {
+		q.TableBits = 17
+	}
+	if len(q.table) < 1<<q.TableBits {
+		q.table = make([]uint32, 1<<q.TableBits)
+	}
+
+	e := matchEmitter{Dst: dst}
+
+	if len(q.history) > q.MaxDistance*2 {
+		// Trim down the history buffer.
+		delta := len(q.history) - q.MaxDistance
+		copy(q.history, q.history[delta:])
+		q.history = q.history[:q.MaxDistance]
+		if q.ChainLength > 0 {
+			q.chain = q.chain[:q.MaxDistance]
+		}
+
+		for i, v := range q.table {
+			newV := int(v) - delta
+			if newV < 0 {
+				newV = 0
+			}
+			q.table[i] = uint32(newV)
+		}
+	}
+
+	// Append src to the history buffer.
+	e.NextEmit = len(q.history)
+	q.history = append(q.history, src...)
+	if q.ChainLength > 0 {
+		q.chain = append(q.chain, make([]uint16, len(src))...)
+	}
+	src = q.history
+
+	// matches stores the matches that have been found but not emitted,
+	// in reverse order. (matches[0] is the most recent one.)
+	var matches [3]absoluteMatch
+	for i := e.NextEmit; i < len(src)-7; i++ {
+		if matches[0] != (absoluteMatch{}) && i >= matches[0].End {
+			// We have found some matches, and we're far enough along that we probably
+			// won't find overlapping matches, so we might as well emit them.
+			if matches[1] != (absoluteMatch{}) {
+				if matches[1].End > matches[0].Start {
+					matches[1].End = matches[0].Start
+				}
+				if matches[1].End-matches[1].Start >= q.MinLength && q.score(matches[1]) > 0 {
+					e.emit(matches[1])
+				}
+			}
+			e.emit(matches[0])
+			matches = [3]absoluteMatch{}
+		}
+
+		// Calculate and store the hash.
+		h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - q.TableBits)
+		candidate := int(q.table[h])
+		q.table[h] = uint32(i)
+		if q.ChainLength > 0 && candidate != 0 {
+			delta := i - candidate
+			if delta < 1<<16 {
+				q.chain[i] = uint16(delta)
+			}
+		}
+
+		if i < matches[0].End && i != matches[0].End+2-q.HashLen {
+			continue
+		}
+		if candidate == 0 || i-candidate > q.MaxDistance {
+			continue
+		}
+
+		// Look for a match.
+		var currentMatch absoluteMatch
+
+		if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
+			m := extendMatch2(src, i, candidate, e.NextEmit)
+			if m.End-m.Start > q.MinLength && q.score(m) > 0 {
+				currentMatch = m
+			}
+		}
+
+		for j := 0; j < q.ChainLength; j++ {
+			delta := q.chain[candidate]
+			if delta == 0 {
+				break
+			}
+			candidate -= int(delta)
+			if candidate <= 0 || i-candidate > q.MaxDistance {
+				break
+			}
+			if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
+				m := extendMatch2(src, i, candidate, e.NextEmit)
+				if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) {
+					currentMatch = m
+				}
+			}
+		}
+
+		if currentMatch.End-currentMatch.Start < q.MinLength {
+			continue
+		}
+
+		overlapPenalty := 0
+		if matches[0] != (absoluteMatch{}) {
+			overlapPenalty = 275
+			if currentMatch.Start <= matches[1].End {
+				// This match would completely replace the previous match,
+				// so there is no penalty for overlap.
+ overlapPenalty = 0 + } + } + + if q.score(currentMatch) <= q.score(matches[0])+overlapPenalty { + continue + } + + matches = [3]absoluteMatch{ + currentMatch, + matches[0], + matches[1], + } + + if matches[2] == (absoluteMatch{}) { + continue + } + + // We have three matches, so it's time to emit one and/or eliminate one. + switch { + case matches[0].Start < matches[2].End: + // The first and third matches overlap; discard the one in between. + matches = [3]absoluteMatch{ + matches[0], + matches[2], + absoluteMatch{}, + } + + case matches[0].Start < matches[2].End+q.MinLength: + // The first and third matches don't overlap, but there's no room for + // another match between them. Emit the first match and discard the second. + e.emit(matches[2]) + matches = [3]absoluteMatch{ + matches[0], + absoluteMatch{}, + absoluteMatch{}, + } + + default: + // Emit the first match, shortening it if necessary to avoid overlap with the second. + if matches[2].End > matches[1].Start { + matches[2].End = matches[1].Start + } + if matches[2].End-matches[2].Start >= q.MinLength && q.score(matches[2]) > 0 { + e.emit(matches[2]) + } + matches[2] = absoluteMatch{} + } + } + + // We've found all the matches now; emit the remaining ones. + if matches[1] != (absoluteMatch{}) { + if matches[1].End > matches[0].Start { + matches[1].End = matches[0].Start + } + if matches[1].End-matches[1].Start >= q.MinLength && q.score(matches[1]) > 0 { + e.emit(matches[1]) + } + } + if matches[0] != (absoluteMatch{}) { + e.emit(matches[0]) + } + + dst = e.Dst + if e.NextEmit < len(src) { + dst = append(dst, Match{ + Unmatched: len(src) - e.NextEmit, + }) + } + + return dst +} + +const hashMul64 = 0x1E35A7BD1E35A7BD + +// extendMatch returns the largest k such that k <= len(src) and that +// src[i:i+k-j] and src[j:k] have the same contents. +// +// It assumes that: +// +// 0 <= i && i < j && j <= len(src) +func extendMatch(src []byte, i, j int) int { + switch runtime.GOARCH { + case "amd64": + // As long as we are 8 or more bytes before the end of src, we can load and + // compare 8 bytes at a time. If those 8 bytes are equal, repeat. + for j+8 < len(src) { + iBytes := binary.LittleEndian.Uint64(src[i:]) + jBytes := binary.LittleEndian.Uint64(src[j:]) + if iBytes != jBytes { + // If those 8 bytes were not equal, XOR the two 8 byte values, and return + // the index of the first byte that differs. The BSF instruction finds the + // least significant 1 bit, the amd64 architecture is little-endian, and + // the shift by 3 converts a bit index to a byte index. + return j + bits.TrailingZeros64(iBytes^jBytes)>>3 + } + i, j = i+8, j+8 + } + case "386": + // On a 32-bit CPU, we do it 4 bytes at a time. + for j+4 < len(src) { + iBytes := binary.LittleEndian.Uint32(src[i:]) + jBytes := binary.LittleEndian.Uint32(src[j:]) + if iBytes != jBytes { + return j + bits.TrailingZeros32(iBytes^jBytes)>>3 + } + i, j = i+4, j+4 + } + } + for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 { + } + return j +} + +// Given a 4-byte match at src[start] and src[candidate], extendMatch2 extends it +// upward as far as possible, and downward no farther than to min. 
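+//
+// For example (illustrative): with src = "abcabcabc", start = 3,
+// candidate = 0 and min = 0, the forward extension reaches the end of
+// src, giving absoluteMatch{Start: 3, End: 9, Match: 0}, i.e. a
+// length-6 match at distance 3.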
+func extendMatch2(src []byte, start, candidate, min int) absoluteMatch { + end := extendMatch(src, candidate+4, start+4) + for start > min && candidate > 0 && src[start-1] == src[candidate-1] { + start-- + candidate-- + } + return absoluteMatch{ + Start: start, + End: end, + Match: candidate, + } +} diff --git a/vendor/github.com/andybalholm/brotli/matchfinder/matchfinder.go b/vendor/github.com/andybalholm/brotli/matchfinder/matchfinder.go new file mode 100644 index 0000000000..f6bcfdb39c --- /dev/null +++ b/vendor/github.com/andybalholm/brotli/matchfinder/matchfinder.go @@ -0,0 +1,103 @@ +// The matchfinder package defines reusable components for data compression. +// +// Many compression libraries have two main parts: +// - Something that looks for repeated sequences of bytes +// - An encoder for the compressed data format (often an entropy coder) +// +// Although these are logically two separate steps, the implementations are +// usually closely tied together. You can't use flate's matcher with snappy's +// encoder, for example. This package defines interfaces and an intermediate +// representation to allow mixing and matching compression components. +package matchfinder + +import "io" + +// A Match is the basic unit of LZ77 compression. +type Match struct { + Unmatched int // the number of unmatched bytes since the previous match + Length int // the number of bytes in the matched string; it may be 0 at the end of the input + Distance int // how far back in the stream to copy from +} + +// A MatchFinder performs the LZ77 stage of compression, looking for matches. +type MatchFinder interface { + // FindMatches looks for matches in src, appends them to dst, and returns dst. + FindMatches(dst []Match, src []byte) []Match + + // Reset clears any internal state, preparing the MatchFinder to be used with + // a new stream. + Reset() +} + +// An Encoder encodes the data in its final format. +type Encoder interface { + // Encode appends the encoded format of src to dst, using the match + // information from matches. + Encode(dst []byte, src []byte, matches []Match, lastBlock bool) []byte + + // Reset clears any internal state, preparing the Encoder to be used with + // a new stream. + Reset() +} + +// A Writer uses MatchFinder and Encoder to write compressed data to Dest. +type Writer struct { + Dest io.Writer + MatchFinder MatchFinder + Encoder Encoder + + // BlockSize is the number of bytes to compress at a time. If it is zero, + // each Write operation will be treated as one block. + BlockSize int + + err error + inBuf []byte + outBuf []byte + matches []Match +} + +func (w *Writer) Write(p []byte) (n int, err error) { + if w.err != nil { + return 0, w.err + } + + if w.BlockSize == 0 { + return w.writeBlock(p, false) + } + + w.inBuf = append(w.inBuf, p...) 
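+	// Compress as many whole blocks as are currently buffered; any
+	// remainder stays in inBuf for the next Write or for Close.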
+	var pos int
+	for pos = 0; pos+w.BlockSize <= len(w.inBuf) && w.err == nil; pos += w.BlockSize {
+		w.writeBlock(w.inBuf[pos:pos+w.BlockSize], false)
+	}
+	if pos > 0 {
+		n := copy(w.inBuf, w.inBuf[pos:])
+		w.inBuf = w.inBuf[:n]
+	}
+
+	return len(p), w.err
+}
+
+func (w *Writer) writeBlock(p []byte, lastBlock bool) (n int, err error) {
+	w.outBuf = w.outBuf[:0]
+	w.matches = w.MatchFinder.FindMatches(w.matches[:0], p)
+	w.outBuf = w.Encoder.Encode(w.outBuf, p, w.matches, lastBlock)
+	_, w.err = w.Dest.Write(w.outBuf)
+	return len(p), w.err
+}
+
+func (w *Writer) Close() error {
+	w.writeBlock(w.inBuf, true)
+	w.inBuf = w.inBuf[:0]
+	return w.err
+}
+
+func (w *Writer) Reset(newDest io.Writer) {
+	w.MatchFinder.Reset()
+	w.Encoder.Reset()
+	w.err = nil
+	w.inBuf = w.inBuf[:0]
+	w.outBuf = w.outBuf[:0]
+	w.matches = w.matches[:0]
+	w.Dest = newDest
+}
diff --git a/vendor/github.com/andybalholm/brotli/matchfinder/textencoder.go b/vendor/github.com/andybalholm/brotli/matchfinder/textencoder.go
new file mode 100644
index 0000000000..75ecc5908b
--- /dev/null
+++ b/vendor/github.com/andybalholm/brotli/matchfinder/textencoder.go
@@ -0,0 +1,53 @@
+package matchfinder
+
+import "fmt"
+
+// A TextEncoder is an Encoder that produces a human-readable representation of
+// the LZ77 compression. Matches are replaced with <Length, Distance> symbols.
+type TextEncoder struct{}
+
+func (t TextEncoder) Reset() {}
+
+func (t TextEncoder) Encode(dst []byte, src []byte, matches []Match, lastBlock bool) []byte {
+	pos := 0
+	for _, m := range matches {
+		if m.Unmatched > 0 {
+			dst = append(dst, src[pos:pos+m.Unmatched]...)
+			pos += m.Unmatched
+		}
+		if m.Length > 0 {
+			dst = append(dst, []byte(fmt.Sprintf("<%d,%d>", m.Length, m.Distance))...)
+			pos += m.Length
+		}
+	}
+	if pos < len(src) {
+		dst = append(dst, src[pos:]...)
+	}
+	return dst
+}
+
+// A NoMatchFinder implements MatchFinder, but doesn't find any matches.
+// It can be used to implement the equivalent of the standard library flate package's
+// HuffmanOnly setting.
+type NoMatchFinder struct{}
+
+func (n NoMatchFinder) Reset() {}
+
+func (n NoMatchFinder) FindMatches(dst []Match, src []byte) []Match {
+	return append(dst, Match{
+		Unmatched: len(src),
+	})
+}
+
+// AutoReset wraps a MatchFinder that can return references to data in previous
+// blocks, and calls Reset before each block. It is useful for (e.g.) using a
+// snappy Encoder with a MatchFinder designed for flate. (Snappy doesn't
+// support references between blocks.)
+type AutoReset struct {
+	MatchFinder
+}
+
+func (a AutoReset) FindMatches(dst []Match, src []byte) []Match {
+	a.Reset()
+	return a.MatchFinder.FindMatches(dst, src)
+}
diff --git a/vendor/github.com/andybalholm/brotli/reader.go b/vendor/github.com/andybalholm/brotli/reader.go
index 5c795e6e9e..3e22789350 100644
--- a/vendor/github.com/andybalholm/brotli/reader.go
+++ b/vendor/github.com/andybalholm/brotli/reader.go
@@ -27,13 +27,21 @@ func NewReader(src io.Reader) *Reader {
 }
 
 // Reset discards the Reader's state and makes it equivalent to the result of
-// its original state from NewReader, but writing to src instead.
+// its original state from NewReader, but reading from src instead.
 // This permits reusing a Reader rather than allocating a new one.
 // Error is always nil
 func (r *Reader) Reset(src io.Reader) error {
+	if r.error_code < 0 {
+		// There was an unrecoverable error, leaving the Reader's state
+		// undefined. Clear out everything but the buffer.
+ *r = Reader{buf: r.buf} + } + decoderStateInit(r) r.src = src - r.buf = make([]byte, readBufSize) + if r.buf == nil { + r.buf = make([]byte, readBufSize) + } return nil } @@ -41,6 +49,9 @@ func (r *Reader) Read(p []byte) (n int, err error) { if !decoderHasMoreOutput(r) && len(r.in) == 0 { m, readErr := r.src.Read(r.buf) if m == 0 { + if readErr == io.EOF && r.state != stateDone { + readErr = io.ErrUnexpectedEOF + } // If readErr is `nil`, we just proxy underlying stream behavior. return 0, readErr } diff --git a/vendor/github.com/andybalholm/brotli/state.go b/vendor/github.com/andybalholm/brotli/state.go index d03348fe80..38d753ebe4 100644 --- a/vendor/github.com/andybalholm/brotli/state.go +++ b/vendor/github.com/andybalholm/brotli/state.go @@ -200,7 +200,6 @@ func decoderStateInit(s *Reader) bool { s.block_type_trees = nil s.block_len_trees = nil - s.ringbuffer = nil s.ringbuffer_size = 0 s.new_ringbuffer_size = 0 s.ringbuffer_mask = 0 diff --git a/vendor/github.com/andybalholm/brotli/static_dict.go b/vendor/github.com/andybalholm/brotli/static_dict.go index 8e7492d7ae..bc05566d6f 100644 --- a/vendor/github.com/andybalholm/brotli/static_dict.go +++ b/vendor/github.com/andybalholm/brotli/static_dict.go @@ -77,8 +77,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le var offset uint = uint(dict.buckets[hash(data)]) var end bool = offset == 0 for !end { - var w dictWord - w = dict.dict_words[offset] + w := dict.dict_words[offset] offset++ var l uint = uint(w.len) & 0x1F var n uint = uint(1) << dict.words.size_bits_by_length[l] @@ -431,8 +430,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le var offset uint = uint(dict.buckets[hash(data[1:])]) var end bool = offset == 0 for !end { - var w dictWord - w = dict.dict_words[offset] + w := dict.dict_words[offset] offset++ var l uint = uint(w.len) & 0x1F var n uint = uint(1) << dict.words.size_bits_by_length[l] @@ -596,8 +594,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le var offset uint = uint(dict.buckets[hash(data[2:])]) var end bool = offset == 0 for !end { - var w dictWord - w = dict.dict_words[offset] + w := dict.dict_words[offset] offset++ var l uint = uint(w.len) & 0x1F var n uint = uint(1) << dict.words.size_bits_by_length[l] @@ -629,8 +626,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le var offset uint = uint(dict.buckets[hash(data[5:])]) var end bool = offset == 0 for !end { - var w dictWord - w = dict.dict_words[offset] + w := dict.dict_words[offset] offset++ var l uint = uint(w.len) & 0x1F var n uint = uint(1) << dict.words.size_bits_by_length[l] diff --git a/vendor/github.com/andybalholm/brotli/utf8_util.go b/vendor/github.com/andybalholm/brotli/utf8_util.go index f86de3d209..3244247eec 100644 --- a/vendor/github.com/andybalholm/brotli/utf8_util.go +++ b/vendor/github.com/andybalholm/brotli/utf8_util.go @@ -58,8 +58,7 @@ func isMostlyUTF8(data []byte, pos uint, mask uint, length uint, min_fraction fl var i uint = 0 for i < length { var symbol int - var current_data []byte - current_data = data[(pos+i)&mask:] + current_data := data[(pos+i)&mask:] var bytes_read uint = parseAsUTF8(&symbol, current_data, length-i) i += bytes_read if symbol < 0x110000 { diff --git a/vendor/github.com/andybalholm/brotli/write_bits.go b/vendor/github.com/andybalholm/brotli/write_bits.go 
index 2d216d7ccd..8729901198 100644 --- a/vendor/github.com/andybalholm/brotli/write_bits.go +++ b/vendor/github.com/andybalholm/brotli/write_bits.go @@ -1,5 +1,7 @@ package brotli +import "encoding/binary" + /* Copyright 2010 Google Inc. All Rights Reserved. Distributed under MIT license. @@ -8,87 +10,43 @@ package brotli /* Write bits into a byte array. */ -type bitWriter struct { - dst []byte - - // Data waiting to be written is the low nbits of bits. - bits uint64 - nbits uint -} +/* This function writes bits into bytes in increasing addresses, and within + a byte least-significant-bit first. -func (w *bitWriter) writeBits(nb uint, b uint64) { - w.bits |= b << w.nbits - w.nbits += nb - if w.nbits >= 32 { - bits := w.bits - w.bits >>= 32 - w.nbits -= 32 - w.dst = append(w.dst, - byte(bits), - byte(bits>>8), - byte(bits>>16), - byte(bits>>24), - ) - } -} + The function can write up to 56 bits in one go with WriteBits + Example: let's assume that 3 bits (Rs below) have been written already: -func (w *bitWriter) writeSingleBit(bit bool) { - if bit { - w.writeBits(1, 1) - } else { - w.writeBits(1, 0) - } -} + BYTE-0 BYTE+1 BYTE+2 -func (w *bitWriter) jumpToByteBoundary() { - dst := w.dst - for w.nbits != 0 { - dst = append(dst, byte(w.bits)) - w.bits >>= 8 - if w.nbits > 8 { // Avoid underflow - w.nbits -= 8 - } else { - w.nbits = 0 - } - } - w.bits = 0 - w.dst = dst -} + 0000 0RRR 0000 0000 0000 0000 -func (w *bitWriter) writeBytes(b []byte) { - if w.nbits&7 != 0 { - panic("writeBytes with unfinished bits") - } - for w.nbits != 0 { - w.dst = append(w.dst, byte(w.bits)) - w.bits >>= 8 - w.nbits -= 8 - } - w.dst = append(w.dst, b...) -} + Now, we could write 5 or less bits in MSB by just sifting by 3 + and OR'ing to BYTE-0. -func (w *bitWriter) getPos() uint { - return uint(len(w.dst)<<3) + w.nbits + For n bits, we take the last 5 bits, OR that with high bits in BYTE-0, + and locate the rest in BYTE+1, BYTE+2, etc. */ +func writeBits(n_bits uint, bits uint64, pos *uint, array []byte) { + /* This branch of the code can write up to 56 bits at a time, + 7 bits are lost by being perhaps already in *p and at least + 1 bit is needed to initialize the bit-stream ahead (i.e. if 7 + bits are in *p and we write 57 bits, then the next write will + access a byte that was never initialized). 
*/
+	p := array[*pos>>3:]
+	v := uint64(p[0])
+	v |= bits << (*pos & 7)
+	binary.LittleEndian.PutUint64(p, v)
+	*pos += n_bits
 }
 
-func (w *bitWriter) rewind(p uint) {
-	w.bits = uint64(w.dst[p>>3] & byte((1<<(p&7))-1))
-	w.nbits = p & 7
-	w.dst = w.dst[:p>>3]
+func writeSingleBit(bit bool, pos *uint, array []byte) {
+	if bit {
+		writeBits(1, 1, pos, array)
+	} else {
+		writeBits(1, 0, pos, array)
+	}
 }
 
-func (w *bitWriter) updateBits(n_bits uint, bits uint32, pos uint) {
-	for n_bits > 0 {
-		var byte_pos uint = pos >> 3
-		var n_unchanged_bits uint = pos & 7
-		var n_changed_bits uint = brotli_min_size_t(n_bits, 8-n_unchanged_bits)
-		var total_bits uint = n_unchanged_bits + n_changed_bits
-		var mask uint32 = (^((1 << total_bits) - 1)) | ((1 << n_unchanged_bits) - 1)
-		var unchanged_bits uint32 = uint32(w.dst[byte_pos]) & mask
-		var changed_bits uint32 = bits & ((1 << n_changed_bits) - 1)
-		w.dst[byte_pos] = byte(changed_bits<<n_unchanged_bits | unchanged_bits)
-		n_bits -= n_changed_bits
-		bits >>= n_changed_bits
-		pos += n_changed_bits
-	}
+func writeBitsPrepareStorage(pos uint, array []byte) {
+	assert(pos&7 == 0)
+	array[pos>>3] = 0
 }
diff --git a/vendor/github.com/andybalholm/brotli/writer.go b/vendor/github.com/andybalholm/brotli/writer.go
index 63676b4673..8a688117d1 100644
--- a/vendor/github.com/andybalholm/brotli/writer.go
+++ b/vendor/github.com/andybalholm/brotli/writer.go
@@ -3,6 +3,8 @@ package brotli
 import (
 	"errors"
 	"io"
+
+	"github.com/andybalholm/brotli/matchfinder"
 )
 
 const (
@@ -61,6 +63,7 @@ func (w *Writer) Reset(dst io.Writer) {
 		w.params.lgwin = uint(w.options.LGWin)
 	}
 	w.dst = dst
+	w.err = nil
 }
 
 func (w *Writer) writeChunk(p []byte, op int) (n int, err error) {
@@ -116,3 +119,44 @@ type nopCloser struct {
 }
 
 func (nopCloser) Close() error { return nil }
+
+// NewWriterV2 is like NewWriterLevel, but it uses the new implementation
+// based on the matchfinder package. It currently supports up to level 7;
+// if a higher level is specified, level 7 will be used.
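+//
+// An illustrative use (dst is any io.Writer; error handling omitted):
+//
+//	w := brotli.NewWriterV2(dst, brotli.DefaultCompression)
+//	w.Write(data)
+//	w.Close()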
+func NewWriterV2(dst io.Writer, level int) *matchfinder.Writer { + var mf matchfinder.MatchFinder + if level < 2 { + mf = matchfinder.M0{Lazy: level == 1} + } else { + hashLen := 6 + if level >= 6 { + hashLen = 5 + } + chainLen := 64 + switch level { + case 2: + chainLen = 0 + case 3: + chainLen = 1 + case 4: + chainLen = 2 + case 5: + chainLen = 4 + case 6: + chainLen = 8 + } + mf = &matchfinder.M4{ + MaxDistance: 1 << 20, + ChainLength: chainLen, + HashLen: hashLen, + DistanceBitCost: 57, + } + } + + return &matchfinder.Writer{ + Dest: dst, + MatchFinder: mf, + Encoder: &Encoder{}, + BlockSize: 1 << 16, + } +} diff --git a/vendor/github.com/bodgit/plumbing/.golangci.yaml b/vendor/github.com/bodgit/plumbing/.golangci.yaml new file mode 100644 index 0000000000..94477c85f4 --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/.golangci.yaml @@ -0,0 +1,10 @@ +--- +linters: + enable-all: true + disable: + - dupword + - exhaustivestruct + - exhaustruct + - nonamedreturns + - varnamelen + - wrapcheck diff --git a/vendor/github.com/bodgit/plumbing/.goreleaser.yml b/vendor/github.com/bodgit/plumbing/.goreleaser.yml new file mode 100644 index 0000000000..75e2a1f7e0 --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/.goreleaser.yml @@ -0,0 +1,7 @@ +--- +builds: + - skip: true +release: + prerelease: auto +changelog: + use: github-native diff --git a/vendor/github.com/bodgit/plumbing/LICENSE b/vendor/github.com/bodgit/plumbing/LICENSE new file mode 100644 index 0000000000..5df2ec521e --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/LICENSE @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2019, Matt Dainty +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ diff --git a/vendor/github.com/bodgit/plumbing/README.md b/vendor/github.com/bodgit/plumbing/README.md new file mode 100644 index 0000000000..79d6ec6a22 --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/README.md @@ -0,0 +1,11 @@ +[![Build Status](https://img.shields.io/github/workflow/status/bodgit/plumbing/build)](https://github.com/bodgit/plumbing/actions?query=workflow%3Abuild) +[![Coverage Status](https://coveralls.io/repos/github/bodgit/plumbing/badge.svg?branch=master)](https://coveralls.io/github/bodgit/plumbing?branch=master) +[![Go Report Card](https://goreportcard.com/badge/github.com/bodgit/plumbing)](https://goreportcard.com/report/github.com/bodgit/plumbing) +[![GoDoc](https://godoc.org/github.com/bodgit/plumbing?status.svg)](https://godoc.org/github.com/bodgit/plumbing) +![Go version](https://img.shields.io/badge/Go-1.19-brightgreen.svg) +![Go version](https://img.shields.io/badge/Go-1.18-brightgreen.svg) + +plumbing +======== + +Assorted I/O U-bends, T-pieces, etc. diff --git a/vendor/github.com/bodgit/plumbing/count.go b/vendor/github.com/bodgit/plumbing/count.go new file mode 100644 index 0000000000..a2adcf0f03 --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/count.go @@ -0,0 +1,23 @@ +package plumbing + +import ( + "sync/atomic" +) + +// WriteCounter is an io.Writer that simply counts the number of bytes written +// to it. +type WriteCounter struct { + count uint64 +} + +func (wc *WriteCounter) Write(p []byte) (int, error) { + n := len(p) + atomic.AddUint64(&wc.count, uint64(n)) + + return n, nil +} + +// Count returns the number of bytes written. +func (wc *WriteCounter) Count() uint64 { + return atomic.LoadUint64(&wc.count) +} diff --git a/vendor/github.com/bodgit/plumbing/fill.go b/vendor/github.com/bodgit/plumbing/fill.go new file mode 100644 index 0000000000..353d6a607c --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/fill.go @@ -0,0 +1,21 @@ +package plumbing + +import "io" + +type fillReader struct { + b byte +} + +func (r *fillReader) Read(p []byte) (int, error) { + for i := range p { + p[i] = r.b + } + + return len(p), nil +} + +// FillReader returns an io.Reader such that Read calls return an unlimited +// stream of b bytes. +func FillReader(b byte) io.Reader { + return &fillReader{b} +} diff --git a/vendor/github.com/bodgit/plumbing/limit.go b/vendor/github.com/bodgit/plumbing/limit.go new file mode 100644 index 0000000000..5dbba039c8 --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/limit.go @@ -0,0 +1,39 @@ +package plumbing + +import "io" + +// A LimitedReadCloser reads from R but limits the amount of +// data returned to just N bytes. Each call to Read +// updates N to reflect the new amount remaining. +// Read returns EOF when N <= 0 or when the underlying R returns EOF. +type LimitedReadCloser struct { + R io.ReadCloser + N int64 +} + +func (l *LimitedReadCloser) Read(p []byte) (n int, err error) { + if l.N <= 0 { + return 0, io.EOF + } + + if int64(len(p)) > l.N { + p = p[0:l.N] + } + + n, err = l.R.Read(p) + l.N -= int64(n) + + return +} + +// Close closes the LimitedReadCloser, rendering it unusable for I/O. +func (l *LimitedReadCloser) Close() error { + return l.R.Close() +} + +// LimitReadCloser returns an io.ReadCloser that reads from r +// but stops with EOF after n bytes. +// The underlying implementation is a *LimitedReadCloser. 
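+//
+// For example (illustrative): LimitReadCloser(f, 4) on a 10-byte file
+// yields the first 4 bytes followed by io.EOF; Close still closes f.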
+func LimitReadCloser(r io.ReadCloser, n int64) io.ReadCloser {
+	return &LimitedReadCloser{r, n}
+}
diff --git a/vendor/github.com/bodgit/plumbing/multi.go b/vendor/github.com/bodgit/plumbing/multi.go
new file mode 100644
index 0000000000..6a1bb7a91f
--- /dev/null
+++ b/vendor/github.com/bodgit/plumbing/multi.go
@@ -0,0 +1,111 @@
+package plumbing
+
+import (
+	"io"
+)
+
+type multiWriteCloser struct {
+	writeClosers []io.WriteCloser
+}
+
+func (t *multiWriteCloser) Write(p []byte) (n int, err error) {
+	for _, wc := range t.writeClosers {
+		n, err = wc.Write(p)
+		if err != nil {
+			return
+		}
+
+		if n != len(p) {
+			err = io.ErrShortWrite
+
+			return
+		}
+	}
+
+	return len(p), nil
+}
+
+func (t *multiWriteCloser) Close() (err error) {
+	for _, wc := range t.writeClosers {
+		err = wc.Close()
+		if err != nil {
+			return
+		}
+	}
+
+	return
+}
+
+// MultiWriteCloser creates a writer that duplicates its writes to all the
+// provided writers, similar to the Unix tee(1) command.
+//
+// Each write is written to each listed writer, one at a time.
+// If a listed writer returns an error, that overall write operation
+// stops and returns the error; it does not continue down the list.
+func MultiWriteCloser(writeClosers ...io.WriteCloser) io.WriteCloser {
+	allWriteClosers := make([]io.WriteCloser, 0, len(writeClosers))
+
+	for _, wc := range writeClosers {
+		if mwc, ok := wc.(*multiWriteCloser); ok {
+			allWriteClosers = append(allWriteClosers, mwc.writeClosers...)
+		} else {
+			allWriteClosers = append(allWriteClosers, wc)
+		}
+	}
+
+	return &multiWriteCloser{allWriteClosers}
+}
+
+type multiReadCloser struct {
+	readClosers []io.ReadCloser
+	i           int
+}
+
+func (mrc *multiReadCloser) Read(p []byte) (n int, err error) {
+	for mrc.i < len(mrc.readClosers) {
+		if len(mrc.readClosers) == 1 {
+			if rc, ok := mrc.readClosers[0].(*multiReadCloser); ok {
+				mrc.readClosers = rc.readClosers
+
+				continue
+			}
+		}
+
+		n, err = mrc.readClosers[mrc.i].Read(p)
+		if err == io.EOF { //nolint:errorlint
+			mrc.i++
+		}
+
+		if n > 0 || err != io.EOF { //nolint:errorlint
+			if err == io.EOF && mrc.i < len(mrc.readClosers) { //nolint:errorlint
+				err = nil
+			}
+
+			return
+		}
+	}
+
+	return 0, io.EOF
+}
+
+func (mrc *multiReadCloser) Close() (err error) {
+	for _, rc := range mrc.readClosers {
+		err = rc.Close()
+		if err != nil {
+			return
+		}
+	}
+
+	return
+}
+
+// MultiReadCloser returns an io.ReadCloser that's the logical concatenation
+// of the provided input readers. They're read sequentially. Once all inputs
+// have returned io.EOF, Read will return EOF. If any of the readers return
+// a non-nil, non-EOF error, Read will return that error.
+func MultiReadCloser(readClosers ...io.ReadCloser) io.ReadCloser {
+	rc := make([]io.ReadCloser, len(readClosers))
+	copy(rc, readClosers)
+
+	return &multiReadCloser{rc, 0}
+}
diff --git a/vendor/github.com/bodgit/plumbing/padded.go b/vendor/github.com/bodgit/plumbing/padded.go
new file mode 100644
index 0000000000..675c85d6e3
--- /dev/null
+++ b/vendor/github.com/bodgit/plumbing/padded.go
@@ -0,0 +1,12 @@
+package plumbing
+
+import (
+	"io"
+)
+
+// PaddedReader returns an io.Reader that reads at most n bytes from r. If
+// fewer than n bytes are available from r then any remaining bytes return
+// fill instead.
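+//
+// For example (illustrative): PaddedReader(strings.NewReader("abc"), 5, '_')
+// reads as "abc__".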
+func PaddedReader(r io.Reader, n int64, fill byte) io.Reader { + return io.LimitReader(io.MultiReader(r, FillReader(fill)), n) +} diff --git a/vendor/github.com/bodgit/plumbing/plumbing.go b/vendor/github.com/bodgit/plumbing/plumbing.go new file mode 100644 index 0000000000..58d8f74d1f --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/plumbing.go @@ -0,0 +1,18 @@ +// Package plumbing is a collection of assorted I/O helpers. +package plumbing + +import "io" + +type nopWriteCloser struct { + io.Writer +} + +func (nopWriteCloser) Close() error { + return nil +} + +// NopWriteCloser returns an io.WriteCloser with a no-op Close method +// wrapping the provided io.Writer w. +func NopWriteCloser(w io.Writer) io.WriteCloser { + return nopWriteCloser{w} +} diff --git a/vendor/github.com/bodgit/plumbing/tee.go b/vendor/github.com/bodgit/plumbing/tee.go new file mode 100644 index 0000000000..1f42423fb7 --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/tee.go @@ -0,0 +1,57 @@ +package plumbing + +import "io" + +type teeReaderAt struct { + r io.ReaderAt + w io.Writer +} + +func (t *teeReaderAt) ReadAt(p []byte, off int64) (n int, err error) { + n, err = t.r.ReadAt(p, off) + if n > 0 { + if n, err := t.w.Write(p[:n]); err != nil { + return n, err + } + } + + return +} + +// TeeReaderAt returns an io.ReaderAt that writes to w what it reads from r. +// All reads from r performed through it are matched with corresponding writes +// to w. There is no internal buffering - the write must complete before the +// read completes. Any error encountered while writing is reported as a read +// error. +func TeeReaderAt(r io.ReaderAt, w io.Writer) io.ReaderAt { + return &teeReaderAt{r, w} +} + +type teeReadCloser struct { + r io.ReadCloser + w io.Writer +} + +func (t *teeReadCloser) Read(p []byte) (n int, err error) { + n, err = t.r.Read(p) + if n > 0 { + if n, err := t.w.Write(p[:n]); err != nil { + return n, err + } + } + + return +} + +func (t *teeReadCloser) Close() error { + return t.r.Close() +} + +// TeeReadCloser returns an io.ReadCloser that writes to w what it reads from +// r. All reads from r performed through it are matched with corresponding +// writes to w. There is no internal buffering - the write must complete +// before the read completes. Any error encountered while writing is reported +// as a read error. +func TeeReadCloser(r io.ReadCloser, w io.Writer) io.ReadCloser { + return &teeReadCloser{r, w} +} diff --git a/vendor/github.com/bodgit/plumbing/zero.go b/vendor/github.com/bodgit/plumbing/zero.go new file mode 100644 index 0000000000..943a35d1d5 --- /dev/null +++ b/vendor/github.com/bodgit/plumbing/zero.go @@ -0,0 +1,18 @@ +package plumbing + +import "io" + +type devZero struct { + io.Reader +} + +func (w *devZero) Write(p []byte) (int, error) { + return len(p), nil +} + +// DevZero returns an io.ReadWriter that behaves like /dev/zero such that Read +// calls return an unlimited stream of zero bytes and all Write calls succeed +// without doing anything. 
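+//
+// For example (illustrative): io.CopyN(io.Discard, DevZero(), 1024)
+// consumes 1 KiB of zero bytes.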
+func DevZero() io.ReadWriter { + return &devZero{FillReader(0)} +} diff --git a/vendor/github.com/bodgit/sevenzip/.golangci.yaml b/vendor/github.com/bodgit/sevenzip/.golangci.yaml new file mode 100644 index 0000000000..a13515e79c --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/.golangci.yaml @@ -0,0 +1,105 @@ +--- +issues: + exclude-use-default: false +linters: + disable-all: true + enable: + - asasalint + - asciicheck + - bidichk + - bodyclose + - canonicalheader + - containedctx + - contextcheck + - copyloopvar + - cyclop + - decorder + - dogsled + - dupl + - dupword + - durationcheck + - err113 + - errcheck + - errchkjson + - errname + - errorlint + - exhaustive + - fatcontext + - forbidigo + - forcetypeassert + - funlen + - gci + - ginkgolinter + - gocheckcompilerdirectives + - gochecknoglobals + - gochecknoinits + - gochecksumtype + - gocognit + - goconst + - gocritic + - gocyclo + - godot + - gofmt + - gofumpt + - goheader + - goimports + - gomoddirectives + - gomodguard + - goprintffuncname + - gosec + - gosimple + - gosmopolitan + - govet + - grouper + - importas + - inamedparam + - ineffassign + - interfacebloat + - intrange + - lll + - loggercheck + - maintidx + - makezero + - mirror + - misspell + - musttag + - nakedret + - nestif + - nilerr + - nilnil + - nlreturn + - noctx + - nolintlint + - nosprintfhostport + - paralleltest + - perfsprint + - prealloc + - predeclared + - promlinter + - protogetter + - reassign + - revive + - rowserrcheck + - sloglint + - spancheck + - sqlclosecheck + - staticcheck + - stylecheck + - tagalign + - tagliatelle + - tenv + - testableexamples + - testifylint + - testpackage + - thelper + - tparallel + - typecheck + - unconvert + - unparam + - unused + - usestdlibvars + - wastedassign + - whitespace + - wrapcheck + - wsl + - zerologlint diff --git a/vendor/github.com/bodgit/sevenzip/.pre-commit-config.yaml b/vendor/github.com/bodgit/sevenzip/.pre-commit-config.yaml new file mode 100644 index 0000000000..690eec1e61 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/commitizen-tools/commitizen + rev: v3.5.3 + hooks: + - id: commitizen + - repo: https://github.com/golangci/golangci-lint + rev: v1.61.0 + hooks: + - id: golangci-lint + - repo: https://github.com/gitleaks/gitleaks + rev: v8.18.0 + hooks: + - id: gitleaks diff --git a/vendor/github.com/bodgit/sevenzip/.release-please-manifest.json b/vendor/github.com/bodgit/sevenzip/.release-please-manifest.json new file mode 100644 index 0000000000..0d1bebe1cf --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/.release-please-manifest.json @@ -0,0 +1,3 @@ +{ + ".": "1.6.0" +} diff --git a/vendor/github.com/bodgit/sevenzip/CHANGELOG.md b/vendor/github.com/bodgit/sevenzip/CHANGELOG.md new file mode 100644 index 0000000000..b8b58f522a --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/CHANGELOG.md @@ -0,0 +1,46 @@ +# Changelog + +## [1.6.0](https://github.com/bodgit/sevenzip/compare/v1.5.2...v1.6.0) (2024-11-17) + + +### Features + +* Add ReadError to wrap I/O errors ([#278](https://github.com/bodgit/sevenzip/issues/278)) ([d38d0aa](https://github.com/bodgit/sevenzip/commit/d38d0aaf74e642d9004b8fee09ab93befeffd174)) 
+ +## [1.5.2](https://github.com/bodgit/sevenzip/compare/v1.5.1...v1.5.2) (2024-08-29) + + +### Bug Fixes + +* Avoid panic in Reader init (empty2.7z); header.filesInfo is nil. ([#252](https://github.com/bodgit/sevenzip/issues/252)) ([10d7550](https://github.com/bodgit/sevenzip/commit/10d75506fa01719e9e0f074c4e7b3c3b96f4233d)) +* Lint fixes ([#253](https://github.com/bodgit/sevenzip/issues/253)) ([c82d2e9](https://github.com/bodgit/sevenzip/commit/c82d2e90e52ae81797b0f790fabe90baf35bf581)) + +## [1.5.1](https://github.com/bodgit/sevenzip/compare/v1.5.0...v1.5.1) (2024-04-05) + + +### Performance Improvements + +* Add AES key caching ([#189](https://github.com/bodgit/sevenzip/issues/189)) ([3d794c2](https://github.com/bodgit/sevenzip/commit/3d794c26c683fe80def4496d49106679b868ae2e)) +* Don't use pools for streams with one file ([#194](https://github.com/bodgit/sevenzip/issues/194)) ([b4cfdcf](https://github.com/bodgit/sevenzip/commit/b4cfdcfe0a64380d64c112d41a870dc8c33c1274)) + +## [1.5.0](https://github.com/bodgit/sevenzip/compare/v1.4.5...v1.5.0) (2024-02-08) + + +### Features + +* Export the folder/stream identifier ([#169](https://github.com/bodgit/sevenzip/issues/169)) ([187a49e](https://github.com/bodgit/sevenzip/commit/187a49e243ec0618b527851fcee0503d8436e7c2)) + +## [1.4.5](https://github.com/bodgit/sevenzip/compare/v1.4.4...v1.4.5) (2023-12-12) + + +### Bug Fixes + +* Handle lack of CRC digests ([#143](https://github.com/bodgit/sevenzip/issues/143)) ([4ead944](https://github.com/bodgit/sevenzip/commit/4ead944ad71398931b70a09ea40ba9ce742f4bf7)) +* Handle small reads in branch converters ([#144](https://github.com/bodgit/sevenzip/issues/144)) ([dfaf538](https://github.com/bodgit/sevenzip/commit/dfaf538402be45e6cd12064b3d49e7496d2b22f4)) + +## [1.4.4](https://github.com/bodgit/sevenzip/compare/v1.4.3...v1.4.4) (2023-11-06) + + +### Bug Fixes + +* Handle panic when unpack info is missing ([#117](https://github.com/bodgit/sevenzip/issues/117)) ([db3ba77](https://github.com/bodgit/sevenzip/commit/db3ba775286aa4efce8fdd1c398bf2bd4dfba37d)) diff --git a/vendor/github.com/bodgit/sevenzip/LICENSE b/vendor/github.com/bodgit/sevenzip/LICENSE new file mode 100644 index 0000000000..5a19a8fc6f --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2020, Matt Dainty +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/bodgit/sevenzip/README.md b/vendor/github.com/bodgit/sevenzip/README.md new file mode 100644 index 0000000000..a1e7f02af7 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/README.md @@ -0,0 +1,145 @@ +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/bodgit/sevenzip/badge)](https://securityscorecards.dev/viewer/?uri=github.com/bodgit/sevenzip) +[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/6882/badge)](https://www.bestpractices.dev/projects/6882) +[![GitHub release](https://img.shields.io/github/v/release/bodgit/sevenzip)](https://github.com/bodgit/sevenzip/releases) +[![Build Status](https://img.shields.io/github/actions/workflow/status/bodgit/sevenzip/build.yml?branch=main)](https://github.com/bodgit/sevenzip/actions?query=workflow%3ABuild) +[![Coverage Status](https://coveralls.io/repos/github/bodgit/sevenzip/badge.svg?branch=master)](https://coveralls.io/github/bodgit/sevenzip?branch=master) +[![Go Report Card](https://goreportcard.com/badge/github.com/bodgit/sevenzip)](https://goreportcard.com/report/github.com/bodgit/sevenzip) +[![GoDoc](https://godoc.org/github.com/bodgit/sevenzip?status.svg)](https://godoc.org/github.com/bodgit/sevenzip) +![Go version](https://img.shields.io/badge/Go-1.22-brightgreen.svg) +![Go version](https://img.shields.io/badge/Go-1.21-brightgreen.svg) + +# sevenzip + +A reader for 7-zip archives inspired by `archive/zip`. + +Current status: + +* Pure Go, no external libraries or binaries needed. +* Handles uncompressed headers, (`7za a -mhc=off test.7z ...`). +* Handles compressed headers, (`7za a -mhc=on test.7z ...`). +* Handles password-protected versions of both of the above (`7za a -mhc=on|off -mhe=on -ppassword test.7z ...`). +* Handles archives split into multiple volumes, (`7za a -v100m test.7z ...`). +* Handles self-extracting archives, (`7za a -sfx archive.exe ...`). +* Validates CRC values as it parses the file. +* Supports ARM, BCJ, BCJ2, Brotli, Bzip2, Copy, Deflate, Delta, LZ4, LZMA, LZMA2, PPC, SPARC and Zstandard methods. +* Implements the `fs.FS` interface so you can treat an opened 7-zip archive like a filesystem. + +More examples of 7-zip archives are needed to test all of the different combinations/algorithms possible. + +## Frequently Asked Questions + +### Why is my code running so slow? + +Someone might write the following simple code: +```golang +func extractArchive(archive string) error { + r, err := sevenzip.OpenReader(archive) + if err != nil { + return err + } + defer r.Close() + + for _, f := range r.File { + rc, err := f.Open() + if err != nil { + return err + } + defer rc.Close() + + // Extract the file + } + + return nil +} +``` +Unlike a zip archive where every file is individually compressed, 7-zip archives can have all of the files compressed together in one long compressed stream, supposedly to achieve a better compression ratio. 
+In a naive random access implementation, to read the first file you start at the beginning of the compressed stream and read out that file's worth of bytes.
+To read the second file you have to start at the beginning of the compressed stream again, read and discard the first file's worth of bytes to get to the correct offset in the stream, then read out the second file's worth of bytes.
+You can see that for an archive that contains hundreds of files, extraction can get progressively slower as you have to read and discard more and more data just to get to the right offset in the stream.
+
+This package contains an optimisation that caches and reuses the underlying compressed stream reader so you don't have to keep starting from the beginning for each file, but it does require you to call `rc.Close()` before extracting the next file.
+So write your code similar to this:
+```golang
+func extractFile(file *sevenzip.File) error {
+	rc, err := file.Open()
+	if err != nil {
+		return err
+	}
+	defer rc.Close()
+
+	// Extract the file
+
+	return nil
+}
+
+func extractArchive(archive string) error {
+	r, err := sevenzip.OpenReader(archive)
+	if err != nil {
+		return err
+	}
+	defer r.Close()
+
+	for _, f := range r.File {
+		if err = extractFile(f); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+```
+You can see the main difference is to not defer all of the `Close()` calls until the end of `extractArchive()`.
+
+There is a set of benchmarks in this package that demonstrates the performance boost that the optimisation provides, amongst other techniques:
+```
+$ go test -v -run='^$' -bench='Reader$' -benchtime=60s
+goos: darwin
+goarch: amd64
+pkg: github.com/bodgit/sevenzip
+cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz
+BenchmarkNaiveReader
+BenchmarkNaiveReader-12                 2  31077542628 ns/op
+BenchmarkOptimisedReader
+BenchmarkOptimisedReader-12           434    164854747 ns/op
+BenchmarkNaiveParallelReader
+BenchmarkNaiveParallelReader-12       240    361869339 ns/op
+BenchmarkNaiveSingleParallelReader
+BenchmarkNaiveSingleParallelReader-12 412    171027895 ns/op
+BenchmarkParallelReader
+BenchmarkParallelReader-12            636    112551812 ns/op
+PASS
+ok      github.com/bodgit/sevenzip  472.251s
+```
+The archive used here is just the reference LZMA SDK archive, which is only 1 MiB in size but does contain 630+ files split across three compression streams.
+The only difference between BenchmarkNaiveReader and the rest is the lack of a call to `rc.Close()` between files, so the stream reuse optimisation doesn't take effect.
+
+Don't try and blindly throw goroutines at the problem either, as this can also undo the optimisation; a naive implementation that uses a pool of multiple goroutines to extract each file ends up being nearly 50% slower, and even just using a pool of one goroutine can end up being less efficient.
+The optimal way to employ goroutines is to make use of the `sevenzip.FileHeader.Stream` field; extract files with the same value using the same goroutine.
+This achieves a 50% speed improvement with the LZMA SDK archive, but it very much depends on how many streams there are in the archive.
+
+In general, don't try and extract the files in a different order compared to the natural order within the archive, as that will also undo the optimisation.
+The worst scenario would likely be to extract the archive in reverse order.
+
+### Detecting the wrong password
+
+It's virtually impossible to _reliably_ detect the wrong password versus some other corruption in a password-protected archive.
+This is partly due to how CBC decryption works; with the wrong password you don't get any sort of decryption error, you just get a stream of bytes that aren't the correct ones.
+This manifests itself when the file has been compressed _and_ encrypted; during extraction the file is decrypted and then decompressed, so with the wrong password the decompression algorithm gets handed a stream which isn't valid, and that's the error you see.
+
+A `sevenzip.ReadError` error type can be returned for certain operations.
+If `sevenzip.ReadError.Encrypted` is `true` then encryption is involved and you can use that as a **hint** to either set a password or try a different one.
+Use `errors.As()` to check like this:
+```golang
+r, err := sevenzip.OpenReaderWithPassword(archive, password)
+if err != nil {
+	var e *sevenzip.ReadError
+	if errors.As(err, &e) && e.Encrypted {
+		// Encryption involved, retry with a different password
+	}
+
+	return err
+}
+```
+Be aware that if the archive does not have the headers encrypted, (`7za a -mhe=off -ppassword test.7z ...`), then you can always open the archive and the password is only used when extracting the files.
+
+If files are added to the archive encrypted and _not_ compressed, (`7za a -m0=copy -ppassword test.7z ...`), then you will never get an error extracting with the wrong password, as the only consumer of the decrypted content will be your own code. To detect a potentially wrong password, calculate the CRC value and check that it matches the value in `sevenzip.FileHeader.CRC32`.
diff --git a/vendor/github.com/bodgit/sevenzip/internal/aes7z/key.go b/vendor/github.com/bodgit/sevenzip/internal/aes7z/key.go
new file mode 100644
index 0000000000..04532aa201
--- /dev/null
+++ b/vendor/github.com/bodgit/sevenzip/internal/aes7z/key.go
@@ -0,0 +1,73 @@
+package aes7z
+
+import (
+	"bytes"
+	"crypto/sha256"
+	"encoding/binary"
+	"encoding/hex"
+	"fmt"
+
+	lru "github.com/hashicorp/golang-lru/v2"
+	"go4.org/syncutil"
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/transform"
+)
+
+type cacheKey struct {
+	password string
+	cycles   int
+	salt     string // []byte isn't comparable
+}
+
+const cacheSize = 10
+
+//nolint:gochecknoglobals
+var (
+	once  syncutil.Once
+	cache *lru.Cache[cacheKey, []byte]
+)
+
+func calculateKey(password string, cycles int, salt []byte) ([]byte, error) {
+	if err := once.Do(func() (err error) {
+		cache, err = lru.New[cacheKey, []byte](cacheSize)
+
+		return
+	}); err != nil {
+		return nil, fmt.Errorf("aes7z: error creating cache: %w", err)
+	}
+
+	ck := cacheKey{
+		password: password,
+		cycles:   cycles,
+		salt:     hex.EncodeToString(salt),
+	}
+
+	if key, ok := cache.Get(ck); ok {
+		return key, nil
+	}
+
+	b := bytes.NewBuffer(salt)
+
+	// Convert password to UTF-16LE
+	utf16le := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)
+	t := transform.NewWriter(b, utf16le.NewEncoder())
+	_, _ = t.Write([]byte(password))
+
+	key := make([]byte, sha256.Size)
+	if cycles == 0x3f {
+		copy(key, b.Bytes())
+	} else {
+		h := sha256.New()
+		for i := uint64(0); i < 1<<cycles; i++ {
+			// These will never error
+			_, _ = h.Write(b.Bytes())
+			_ = binary.Write(h, binary.LittleEndian, i)
+		}
+		copy(key, h.Sum(nil))
+	}
+
+	cache.Add(ck, key)
+
+	return key, nil
+}
diff --git a/vendor/github.com/bodgit/sevenzip/internal/aes7z/reader.go b/vendor/github.com/bodgit/sevenzip/internal/aes7z/reader.go
new file mode 100644
--- /dev/null
+++ b/vendor/github.com/bodgit/sevenzip/internal/aes7z/reader.go
+func NewReader(p []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) {
+	rc := new(readCloser)
+
+	salt := p[0]>>7&1 + p[1]>>4
+	iv := p[0]>>6&1 + p[1]&0x0f
+
+	if len(p) != int(2+salt+iv) {
+		return nil, errInsufficientProperties
+	}
+
+	rc.salt = p[2 : 2+salt]
+	rc.iv = make([]byte, aes.BlockSize)
+	copy(rc.iv, p[2+salt:])
+
+	rc.cycles = int(p[0] & 0x3f)
+	rc.rc = readers[0]
+
+	return rc, nil
+}
diff --git a/vendor/github.com/bodgit/sevenzip/internal/bcj2/reader.go b/vendor/github.com/bodgit/sevenzip/internal/bcj2/reader.go
new file mode
100644 index 0000000000..957ea239a6 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/bcj2/reader.go @@ -0,0 +1,234 @@ +// Package bcj2 implements the BCJ2 filter for x86 binaries. +package bcj2 + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + + "github.com/bodgit/sevenzip/internal/util" + "github.com/hashicorp/go-multierror" +) + +type readCloser struct { + main util.ReadCloser + call io.ReadCloser + jump io.ReadCloser + + rd util.ReadCloser + nrange uint + code uint + + sd [256 + 2]uint + + previous byte + written uint32 + + buf *bytes.Buffer +} + +const ( + numMoveBits = 5 + numbitModelTotalBits = 11 + bitModelTotal uint = 1 << numbitModelTotalBits + numTopBits = 24 + topValue uint = 1 << numTopBits +) + +var ( + errAlreadyClosed = errors.New("bcj2: already closed") + errNeedFourReaders = errors.New("bcj2: need exactly four readers") +) + +func isJcc(b0, b1 byte) bool { + return b0 == 0x0f && (b1&0xf0) == 0x80 +} + +func isJ(b0, b1 byte) bool { + return (b1&0xfe) == 0xe8 || isJcc(b0, b1) +} + +func index(b0, b1 byte) int { + switch b1 { + case 0xe8: + return int(b0) + case 0xe9: + return 256 + default: + return 257 + } +} + +// NewReader returns a new BCJ2 io.ReadCloser. +func NewReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 4 { + return nil, errNeedFourReaders + } + + rc := &readCloser{ + main: util.ByteReadCloser(readers[0]), + call: readers[1], + jump: readers[2], + rd: util.ByteReadCloser(readers[3]), + nrange: 0xffffffff, + buf: new(bytes.Buffer), + } + rc.buf.Grow(1 << 16) + + b := make([]byte, 5) + if _, err := io.ReadFull(rc.rd, b); err != nil { + if !errors.Is(err, io.EOF) { + err = fmt.Errorf("bcj2: error reading initial state: %w", err) + } + + return nil, err + } + + for _, x := range b { + rc.code = (rc.code << 8) | uint(x) + } + + for i := range rc.sd { + rc.sd[i] = bitModelTotal >> 1 + } + + return rc, nil +} + +func (rc *readCloser) Close() error { + if rc.main == nil || rc.call == nil || rc.jump == nil || rc.rd == nil { + return errAlreadyClosed + } + + //nolint:lll + if err := multierror.Append(rc.main.Close(), rc.call.Close(), rc.jump.Close(), rc.rd.Close()).ErrorOrNil(); err != nil { + return fmt.Errorf("bcj2: error closing: %w", err) + } + + rc.main, rc.call, rc.jump, rc.rd = nil, nil, nil, nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.main == nil || rc.call == nil || rc.jump == nil || rc.rd == nil { + return 0, errAlreadyClosed + } + + if err := rc.read(); err != nil && !errors.Is(err, io.EOF) { + return 0, err + } + + n, err := rc.buf.Read(p) + if err != nil && !errors.Is(err, io.EOF) { + err = fmt.Errorf("bcj2: error reading: %w", err) + } + + return n, err +} + +func (rc *readCloser) update() error { + if rc.nrange < topValue { + b, err := rc.rd.ReadByte() + if err != nil && !errors.Is(err, io.EOF) { + return fmt.Errorf("bcj2: error reading byte: %w", err) + } + + rc.code = (rc.code << 8) | uint(b) + rc.nrange <<= 8 + } + + return nil +} + +func (rc *readCloser) decode(i int) (bool, error) { + newBound := (rc.nrange >> numbitModelTotalBits) * rc.sd[i] + + if rc.code < newBound { + rc.nrange = newBound + rc.sd[i] += (bitModelTotal - rc.sd[i]) >> numMoveBits + + if err := rc.update(); err != nil { + return false, err + } + + return false, nil + } + + rc.nrange -= newBound + rc.code -= newBound + rc.sd[i] -= rc.sd[i] >> numMoveBits + + if err := rc.update(); err != nil { + return false, err + } + + return true, nil 
+} + +//nolint:cyclop,funlen +func (rc *readCloser) read() error { + var ( + b byte + err error + ) + + for { + if b, err = rc.main.ReadByte(); err != nil { + if !errors.Is(err, io.EOF) { + err = fmt.Errorf("bcj2: error reading byte: %w", err) + } + + return err + } + + rc.written++ + _ = rc.buf.WriteByte(b) + + if isJ(rc.previous, b) { + break + } + + rc.previous = b + + if rc.buf.Len() == rc.buf.Cap() { + return nil + } + } + + bit, err := rc.decode(index(rc.previous, b)) + if err != nil { + return err + } + + //nolint:nestif + if bit { + var r io.Reader + if b == 0xe8 { + r = rc.call + } else { + r = rc.jump + } + + var dest uint32 + if err = binary.Read(r, binary.BigEndian, &dest); err != nil { + if !errors.Is(err, io.EOF) { + err = fmt.Errorf("bcj2: error reading uint32: %w", err) + } + + return err + } + + dest -= rc.written + 4 + _ = binary.Write(rc.buf, binary.LittleEndian, dest) + + rc.previous = byte(dest >> 24) + rc.written += 4 + } else { + rc.previous = b + } + + return nil +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/bra/arm.go b/vendor/github.com/bodgit/sevenzip/internal/bra/arm.go new file mode 100644 index 0000000000..3916a0c297 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/bra/arm.go @@ -0,0 +1,55 @@ +package bra + +import ( + "encoding/binary" + "io" +) + +const armAlignment = 4 + +type arm struct { + ip uint32 +} + +func (c *arm) Size() int { return armAlignment } + +func (c *arm) Convert(b []byte, encoding bool) int { + if len(b) < c.Size() { + return 0 + } + + if c.ip == 0 { + c.ip += armAlignment + } + + var i int + + for i = 0; i < len(b) & ^(armAlignment-1); i += armAlignment { + v := binary.LittleEndian.Uint32(b[i:]) + + c.ip += uint32(armAlignment) + + if b[i+3] == 0xeb { + v <<= 2 + + if encoding { + v += c.ip + } else { + v -= c.ip + } + + v >>= 2 + v &= 0x00ffffff + v |= 0xeb000000 + } + + binary.LittleEndian.PutUint32(b[i:], v) + } + + return i +} + +// NewARMReader returns a new ARM io.ReadCloser. 
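An aside, not part of the vendored source: the ARM converter above rewrites the 24-bit offset of BL instructions (opcode byte 0xeb) between PC-relative and absolute form, so repeated calls to the same target become identical byte sequences and compress better. A minimal round-trip sketch, written as a hypothetical test inside the bra package since the converter types are unexported:

package bra

import (
	"bytes"
	"testing"
)

func TestARMConvertRoundTrip(t *testing.T) {
	// One little-endian BL instruction with a 24-bit offset of 0x000010.
	raw := []byte{0x10, 0x00, 0x00, 0xeb}

	enc := append([]byte(nil), raw...)
	new(arm).Convert(enc, true) // encode: make the branch target absolute

	dec := append([]byte(nil), enc...)
	new(arm).Convert(dec, false) // decode: restore the relative target

	if !bytes.Equal(dec, raw) {
		t.Fatalf("round trip: got % x, want % x", dec, raw)
	}
}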
+func NewARMReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + return newReader(readers, new(arm)) +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/bra/bcj.go b/vendor/github.com/bodgit/sevenzip/internal/bra/bcj.go new file mode 100644 index 0000000000..05f1fdffbc --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/bra/bcj.go @@ -0,0 +1,104 @@ +package bra + +import ( + "encoding/binary" + "io" +) + +const bcjLookAhead = 4 + +type bcj struct { + ip, state uint32 +} + +func (c *bcj) Size() int { return bcjLookAhead + 1 } + +func test86MSByte(b byte) bool { + return (b+1)&0xfe == 0 +} + +//nolint:cyclop,funlen,gocognit +func (c *bcj) Convert(b []byte, encoding bool) int { + if len(b) < c.Size() { + return 0 + } + + var ( + pos uint32 + mask = c.state & 7 + ) + + for { + p := pos + for ; int(p) < len(b)-bcjLookAhead; p++ { + if b[p]&0xfe == 0xe8 { + break + } + } + + d := p - pos + pos = p + + if int(p) >= len(b)-bcjLookAhead { + if d > 2 { + c.state = 0 + } else { + c.state = mask >> d + } + + c.ip += pos + + return int(pos) + } + + if d > 2 { + mask = 0 + } else { + mask >>= d + if mask != 0 && (mask > 4 || mask == 3 || test86MSByte(b[p+(mask>>1)+1])) { + mask = (mask >> 1) | 4 + pos++ + + continue + } + } + + //nolint:nestif + if test86MSByte(b[p+4]) { + v := binary.LittleEndian.Uint32(b[p+1:]) + cur := c.ip + uint32(c.Size()) + pos //nolint:gosec + pos += uint32(c.Size()) //nolint:gosec + + if encoding { + v += cur + } else { + v -= cur + } + + if mask != 0 { + sh := mask & 6 << 2 + if test86MSByte(byte(v >> sh)) { + v ^= (uint32(0x100) << sh) - 1 + if encoding { + v += cur + } else { + v -= cur + } + } + + mask = 0 + } + + binary.LittleEndian.PutUint32(b[p+1:], v) + b[p+4] = 0 - b[p+4]&1 + } else { + mask = (mask >> 1) | 4 + pos++ + } + } +} + +// NewBCJReader returns a new BCJ io.ReadCloser. +func NewBCJReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + return newReader(readers, new(bcj)) +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/bra/bra.go b/vendor/github.com/bodgit/sevenzip/internal/bra/bra.go new file mode 100644 index 0000000000..a7a77d706f --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/bra/bra.go @@ -0,0 +1,7 @@ +// Package bra implements the branch rewriting filter for binaries. 
+package bra + +type converter interface { + Size() int + Convert(b []byte, encoding bool) int +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/bra/minmax_compat.go b/vendor/github.com/bodgit/sevenzip/internal/bra/minmax_compat.go new file mode 100644 index 0000000000..8004c6a6e3 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/bra/minmax_compat.go @@ -0,0 +1,21 @@ +//go:build !go1.21 + +package bra + +//nolint:predeclared +func min(x, y int) int { + if x < y { + return x + } + + return y +} + +//nolint:predeclared +func max(x, y int) int { + if x > y { + return x + } + + return y +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/bra/ppc.go b/vendor/github.com/bodgit/sevenzip/internal/bra/ppc.go new file mode 100644 index 0000000000..9d38243fb8 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/bra/ppc.go @@ -0,0 +1,48 @@ +package bra + +import ( + "encoding/binary" + "io" +) + +const ppcAlignment = 4 + +type ppc struct { + ip uint32 +} + +func (c *ppc) Size() int { return ppcAlignment } + +func (c *ppc) Convert(b []byte, encoding bool) int { + if len(b) < c.Size() { + return 0 + } + + var i int + + for i = 0; i < len(b) & ^(ppcAlignment-1); i += ppcAlignment { + v := binary.BigEndian.Uint32(b[i:]) + + if b[i+0]&0xfc == 0x48 && b[i+3]&3 == 1 { + if encoding { + v += c.ip + } else { + v -= c.ip + } + + v &= 0x03ffffff + v |= 0x48000000 + } + + c.ip += uint32(ppcAlignment) + + binary.BigEndian.PutUint32(b[i:], v) + } + + return i +} + +// NewPPCReader returns a new PPC io.ReadCloser. +func NewPPCReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + return newReader(readers, new(ppc)) +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/bra/reader.go b/vendor/github.com/bodgit/sevenzip/internal/bra/reader.go new file mode 100644 index 0000000000..733333b4d2 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/bra/reader.go @@ -0,0 +1,72 @@ +package bra + +import ( + "bytes" + "errors" + "fmt" + "io" +) + +type readCloser struct { + rc io.ReadCloser + buf bytes.Buffer + n int + conv converter +} + +var ( + errAlreadyClosed = errors.New("bra: already closed") + errNeedOneReader = errors.New("bra: need exactly one reader") +) + +func (rc *readCloser) Close() error { + if rc.rc == nil { + return errAlreadyClosed + } + + if err := rc.rc.Close(); err != nil { + return fmt.Errorf("bra: error closing: %w", err) + } + + rc.rc = nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.rc == nil { + return 0, errAlreadyClosed + } + + if _, err := io.CopyN(&rc.buf, rc.rc, int64(max(len(p), rc.conv.Size())-rc.buf.Len())); err != nil { + if !errors.Is(err, io.EOF) { + return 0, fmt.Errorf("bra: error buffering: %w", err) + } + + if rc.buf.Len() < rc.conv.Size() { + rc.n = rc.buf.Len() + } + } + + rc.n += rc.conv.Convert(rc.buf.Bytes()[rc.n:], false) + + n, err := rc.buf.Read(p[:min(rc.n, len(p))]) + if err != nil && !errors.Is(err, io.EOF) { + err = fmt.Errorf("bra: error reading: %w", err) + } + + rc.n -= n + + return n, err + } + +func newReader(readers []io.ReadCloser, conv converter) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + + return &readCloser{ + rc: readers[0], + conv: conv, + }, nil +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/bra/sparc.go b/vendor/github.com/bodgit/sevenzip/internal/bra/sparc.go new file mode 100644 index
0000000000..8aa45536d2 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/bra/sparc.go @@ -0,0 +1,53 @@ +package bra + +import ( + "encoding/binary" + "io" +) + +const sparcAlignment = 4 + +type sparc struct { + ip uint32 +} + +func (c *sparc) Size() int { return sparcAlignment } + +func (c *sparc) Convert(b []byte, encoding bool) int { + if len(b) < c.Size() { + return 0 + } + + var i int + + for i = 0; i < len(b) & ^(sparcAlignment-1); i += sparcAlignment { + v := binary.BigEndian.Uint32(b[i:]) + + if (b[i+0] == 0x40 && b[i+1]&0xc0 == 0) || (b[i+0] == 0x7f && b[i+1] >= 0xc0) { + v <<= 2 + + if encoding { + v += c.ip + } else { + v -= c.ip + } + + v &= 0x01ffffff + v -= uint32(1) << 24 + v ^= 0xff000000 + v >>= 2 + v |= 0x40000000 + } + + c.ip += uint32(sparcAlignment) + + binary.BigEndian.PutUint32(b[i:], v) + } + + return i +} + +// NewSPARCReader returns a new SPARC io.ReadCloser. +func NewSPARCReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + return newReader(readers, new(sparc)) +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/brotli/reader.go b/vendor/github.com/bodgit/sevenzip/internal/brotli/reader.go new file mode 100644 index 0000000000..a07b7641cb --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/brotli/reader.go @@ -0,0 +1,113 @@ +// Package brotli implements the Brotli decompressor. +package brotli + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "sync" + + "github.com/andybalholm/brotli" + "github.com/bodgit/plumbing" +) + +type readCloser struct { + c io.Closer + r *brotli.Reader +} + +const ( + frameMagic uint32 = 0x184d2a50 + frameSize uint32 = 8 + brotliMagic uint16 = 0x5242 // 'B', 'R' +) + +var ( + //nolint:gochecknoglobals + brotliReaderPool sync.Pool + + errAlreadyClosed = errors.New("brotli: already closed") + errNeedOneReader = errors.New("brotli: need exactly one reader") +) + +// This isn't part of the Brotli format but is prepended by the 7-zip implementation. +type headerFrame struct { + FrameMagic uint32 + FrameSize uint32 + CompressedSize uint32 + BrotliMagic uint16 + UncompressedSize uint16 // * 64 KB +} + +func (rc *readCloser) Close() error { + if rc.c == nil || rc.r == nil { + return errAlreadyClosed + } + + if err := rc.c.Close(); err != nil { + return fmt.Errorf("brotli: error closing: %w", err) + } + + brotliReaderPool.Put(rc.r) + rc.c, rc.r = nil, nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.r == nil { + return 0, errAlreadyClosed + } + + n, err := rc.r.Read(p) + if err != nil && !errors.Is(err, io.EOF) { + err = fmt.Errorf("brotli: error reading: %w", err) + } + + return n, err +} + +// NewReader returns a new Brotli io.ReadCloser. +func NewReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + + hr, b := new(headerFrame), new(bytes.Buffer) + b.Grow(binary.Size(hr)) + + // The 7-Zip Brotli compressor adds a 16 byte frame to the beginning of + // the data which will confuse a pure Brotli implementation. 
Read it + // but keep a copy so we can add it back if it doesn't look right + if err := binary.Read(io.TeeReader(readers[0], b), binary.LittleEndian, hr); err != nil { + if !errors.Is(err, io.EOF) { + err = fmt.Errorf("brotli: error reading frame: %w", err) + } + + return nil, err + } + + var reader io.ReadCloser + + // If the header looks right, continue reading from that point + // onwards, otherwise prepend it again and hope for the best + if hr.FrameMagic == frameMagic && hr.FrameSize == frameSize && hr.BrotliMagic == brotliMagic { + reader = readers[0] + } else { + reader = plumbing.MultiReadCloser(io.NopCloser(b), readers[0]) + } + + r, ok := brotliReaderPool.Get().(*brotli.Reader) + if ok { + _ = r.Reset(reader) + } else { + r = brotli.NewReader(reader) + } + + return &readCloser{ + c: readers[0], + r: r, + }, nil +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/bzip2/reader.go b/vendor/github.com/bodgit/sevenzip/internal/bzip2/reader.go new file mode 100644 index 0000000000..3e824984c9 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/bzip2/reader.go @@ -0,0 +1,58 @@ +// Package bzip2 implements the Bzip2 decompressor. +package bzip2 + +import ( + "compress/bzip2" + "errors" + "fmt" + "io" +) + +type readCloser struct { + c io.Closer + r io.Reader +} + +var ( + errAlreadyClosed = errors.New("bzip2: already closed") + errNeedOneReader = errors.New("bzip2: need exactly one reader") +) + +func (rc *readCloser) Close() error { + if rc.c == nil || rc.r == nil { + return errAlreadyClosed + } + + if err := rc.c.Close(); err != nil { + return fmt.Errorf("bzip2: error closing: %w", err) + } + + rc.c, rc.r = nil, nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.r == nil { + return 0, errAlreadyClosed + } + + n, err := rc.r.Read(p) + if err != nil && !errors.Is(err, io.EOF) { + err = fmt.Errorf("bzip2: error reading: %w", err) + } + + return n, err +} + +// NewReader returns a new bzip2 io.ReadCloser. +func NewReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + + return &readCloser{ + c: readers[0], + r: bzip2.NewReader(readers[0]), + }, nil +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/deflate/reader.go b/vendor/github.com/bodgit/sevenzip/internal/deflate/reader.go new file mode 100644 index 0000000000..c0b4e4dfcd --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/deflate/reader.go @@ -0,0 +1,78 @@ +// Package deflate implements the Deflate decompressor. 
+package deflate + +import ( + "errors" + "fmt" + "io" + "sync" + + "github.com/bodgit/sevenzip/internal/util" + "github.com/hashicorp/go-multierror" + "github.com/klauspost/compress/flate" +) + +type readCloser struct { + c io.Closer + fr io.ReadCloser +} + +var ( + //nolint:gochecknoglobals + flateReaderPool sync.Pool + + errAlreadyClosed = errors.New("deflate: already closed") + errNeedOneReader = errors.New("deflate: need exactly one reader") +) + +func (rc *readCloser) Close() error { + if rc.c == nil || rc.fr == nil { + return errAlreadyClosed + } + + if err := multierror.Append(rc.fr.Close(), rc.c.Close()).ErrorOrNil(); err != nil { + return fmt.Errorf("deflate: error closing: %w", err) + } + + flateReaderPool.Put(rc.fr) + rc.c, rc.fr = nil, nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.c == nil || rc.fr == nil { + return 0, errAlreadyClosed + } + + n, err := rc.fr.Read(p) + if err != nil && !errors.Is(err, io.EOF) { + err = fmt.Errorf("deflate: error reading: %w", err) + } + + return n, err +} + +// NewReader returns a new DEFLATE io.ReadCloser. +func NewReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + + fr, ok := flateReaderPool.Get().(io.ReadCloser) + if ok { + frf, ok := fr.(flate.Resetter) + if ok { + if err := frf.Reset(util.ByteReadCloser(readers[0]), nil); err != nil { + return nil, fmt.Errorf("deflate: error resetting: %w", err) + } + } + } else { + fr = flate.NewReader(util.ByteReadCloser(readers[0])) + } + + return &readCloser{ + c: readers[0], + fr: fr, + }, nil +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/delta/reader.go b/vendor/github.com/bodgit/sevenzip/internal/delta/reader.go new file mode 100644 index 0000000000..0926a175cb --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/delta/reader.go @@ -0,0 +1,89 @@ +// Package delta implements the Delta filter. +package delta + +import ( + "errors" + "fmt" + "io" +) + +type readCloser struct { + rc io.ReadCloser + state [stateSize]byte + delta int +} + +const ( + stateSize = 256 +) + +var ( + errAlreadyClosed = errors.New("delta: already closed") + errNeedOneReader = errors.New("delta: need exactly one reader") + errInsufficientProperties = errors.New("delta: not enough properties") +) + +func (rc *readCloser) Close() error { + if rc.rc == nil { + return errAlreadyClosed + } + + if err := rc.rc.Close(); err != nil { + return fmt.Errorf("delta: error closing: %w", err) + } + + rc.rc = nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.rc == nil { + return 0, errAlreadyClosed + } + + n, err := rc.rc.Read(p) + if err != nil && !errors.Is(err, io.EOF) { + return n, fmt.Errorf("delta: error reading: %w", err) + } + + var ( + buffer [stateSize]byte + j int + ) + + copy(buffer[:], rc.state[:rc.delta]) + + for i := 0; i < n; { + for j = 0; j < rc.delta && i < n; i++ { + p[i] = buffer[j] + p[i] + buffer[j] = p[i] + j++ + } + } + + if j == rc.delta { + j = 0 + } + + copy(rc.state[:], buffer[j:rc.delta]) + copy(rc.state[rc.delta-j:], buffer[:j]) + + return n, err +} + +// NewReader returns a new Delta io.ReadCloser.
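For what it's worth, the state shuffling in Read above is plain byte-wise delta decoding at a configurable distance: each output byte is the input byte plus the output byte emitted delta positions earlier, so distance 1 (property byte 0x00) turns a stream of differences into a running sum. An illustrative sketch, only usable from within this module because the package is internal:

package main

import (
	"bytes"
	"io"
	"log"

	"github.com/bodgit/sevenzip/internal/delta"
)

func main() {
	src := io.NopCloser(bytes.NewReader([]byte{10, 1, 1, 1}))

	rc, err := delta.NewReader([]byte{0x00}, 0, []io.ReadCloser{src})
	if err != nil {
		log.Fatal(err)
	}
	defer rc.Close()

	out := make([]byte, 4)
	if _, err := io.ReadFull(rc, out); err != nil {
		log.Fatal(err)
	}

	log.Printf("%d", out) // [10 11 12 13]: each byte is the previous output plus the stored difference
}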
+func NewReader(p []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + + if len(p) != 1 { + return nil, errInsufficientProperties + } + + return &readCloser{ + rc: readers[0], + delta: int(p[0] + 1), + }, nil +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/lz4/reader.go b/vendor/github.com/bodgit/sevenzip/internal/lz4/reader.go new file mode 100644 index 0000000000..4299334230 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/lz4/reader.go @@ -0,0 +1,71 @@ +// Package lz4 implements the LZ4 decompressor. +package lz4 + +import ( + "errors" + "fmt" + "io" + "sync" + + lz4 "github.com/pierrec/lz4/v4" +) + +type readCloser struct { + c io.Closer + r *lz4.Reader +} + +var ( + //nolint:gochecknoglobals + lz4ReaderPool sync.Pool + + errAlreadyClosed = errors.New("lz4: already closed") + errNeedOneReader = errors.New("lz4: need exactly one reader") +) + +func (rc *readCloser) Close() error { + if rc.c == nil || rc.r == nil { + return errAlreadyClosed + } + + if err := rc.c.Close(); err != nil { + return fmt.Errorf("lz4: error closing: %w", err) + } + + lz4ReaderPool.Put(rc.r) + rc.c, rc.r = nil, nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.r == nil { + return 0, errAlreadyClosed + } + + n, err := rc.r.Read(p) + if err != nil && !errors.Is(err, io.EOF) { + err = fmt.Errorf("lz4: error reading: %w", err) + } + + return n, err +} + +// NewReader returns a new LZ4 io.ReadCloser. +func NewReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + + r, ok := lz4ReaderPool.Get().(*lz4.Reader) + if ok { + r.Reset(readers[0]) + } else { + r = lz4.NewReader(readers[0]) + } + + return &readCloser{ + c: readers[0], + r: r, + }, nil +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/lzma/reader.go b/vendor/github.com/bodgit/sevenzip/internal/lzma/reader.go new file mode 100644 index 0000000000..b8d277c095 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/lzma/reader.go @@ -0,0 +1,112 @@ +// Package lzma implements the LZMA decompressor. +package lzma + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + + "github.com/ulikunitz/xz/lzma" +) + +type readCloser struct { + c io.Closer + r io.Reader +} + +var ( + errAlreadyClosed = errors.New("lzma: already closed") + errNeedOneReader = errors.New("lzma: need exactly one reader") +) + +func (rc *readCloser) Close() error { + if rc.c == nil || rc.r == nil { + return errAlreadyClosed + } + + if err := rc.c.Close(); err != nil { + return fmt.Errorf("lzma: error closing: %w", err) + } + + rc.c, rc.r = nil, nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.r == nil { + return 0, errAlreadyClosed + } + + n, err := rc.r.Read(p) + if err != nil && !errors.Is(err, io.EOF) { + err = fmt.Errorf("lzma: error reading: %w", err) + } + + return n, err +} + +// NewReader returns a new LZMA io.ReadCloser. 
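Worth noting before the constructor below: github.com/ulikunitz/xz/lzma expects the classic 13-byte .lzma header (5 property bytes followed by the uncompressed size as a little-endian uint64), while 7-zip stores the property bytes in the coder metadata and the size separately, so the header has to be reassembled in front of the compressed stream. The same idea reduced to a sketch (package name and example property bytes are assumptions):

package lzmacompat

import (
	"bytes"
	"encoding/binary"
	"io"

	"github.com/ulikunitz/xz/lzma"
)

// classicLZMAReader rebuilds the 13-byte .lzma header from the pieces 7-zip
// stores separately and returns a plain LZMA decoder over the result.
func classicLZMAReader(props []byte, unpackedSize uint64, stream io.Reader) (io.Reader, error) {
	h := bytes.NewBuffer(props) // the 5 property bytes, e.g. 0x5d 0x00 0x00 0x01 0x00 (lc=3 lp=0 pb=2, 64 KiB dictionary)
	if err := binary.Write(h, binary.LittleEndian, unpackedSize); err != nil {
		return nil, err
	}

	return lzma.NewReader(io.MultiReader(h, stream))
}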
+func NewReader(p []byte, s uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + + h := bytes.NewBuffer(p) + _ = binary.Write(h, binary.LittleEndian, s) + + lr, err := lzma.NewReader(multiReader(h, readers[0])) + if err != nil { + return nil, fmt.Errorf("lzma: error creating reader: %w", err) + } + + return &readCloser{ + c: readers[0], + r: lr, + }, nil +} + +func multiReader(b *bytes.Buffer, rc io.ReadCloser) io.Reader { + mr := io.MultiReader(b, rc) + + if br, ok := rc.(io.ByteReader); ok { + return &multiByteReader{ + b: b, + br: br, + mr: mr, + } + } + + return mr +} + +type multiByteReader struct { + b *bytes.Buffer + br io.ByteReader + mr io.Reader +} + +func (m *multiByteReader) ReadByte() (b byte, err error) { + if m.b.Len() > 0 { + b, err = m.b.ReadByte() + } else { + b, err = m.br.ReadByte() + } + + if err != nil { + err = fmt.Errorf("lzma: error multi byte reading: %w", err) + } + + return b, err +} + +func (m *multiByteReader) Read(p []byte) (int, error) { + n, err := m.mr.Read(p) + if err != nil { + err = fmt.Errorf("lzma: error multi reading: %w", err) + } + + return n, err +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/lzma2/reader.go b/vendor/github.com/bodgit/sevenzip/internal/lzma2/reader.go new file mode 100644 index 0000000000..3f2e7be499 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/lzma2/reader.go @@ -0,0 +1,77 @@ +// Package lzma2 implements the LZMA2 decompressor. +package lzma2 + +import ( + "errors" + "fmt" + "io" + + "github.com/ulikunitz/xz/lzma" +) + +type readCloser struct { + c io.Closer + r io.Reader +} + +var ( + errAlreadyClosed = errors.New("lzma2: already closed") + errNeedOneReader = errors.New("lzma2: need exactly one reader") + errInsufficientProperties = errors.New("lzma2: not enough properties") +) + +func (rc *readCloser) Close() error { + if rc.c == nil || rc.r == nil { + return errAlreadyClosed + } + + if err := rc.c.Close(); err != nil { + return fmt.Errorf("lzma2: error closing: %w", err) + } + + rc.c, rc.r = nil, nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.r == nil { + return 0, errAlreadyClosed + } + + n, err := rc.r.Read(p) + if err != nil && !errors.Is(err, io.EOF) { + err = fmt.Errorf("lzma2: error reading: %w", err) + } + + return n, err +} + +// NewReader returns a new LZMA2 io.ReadCloser. +func NewReader(p []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + + if len(p) != 1 { + return nil, errInsufficientProperties + } + + config := lzma.Reader2Config{ + DictCap: (2 | (int(p[0]) & 1)) << (p[0]/2 + 11), // This gem came from Lzma2Dec.c + } + + if err := config.Verify(); err != nil { + return nil, fmt.Errorf("lzma2: error verifying config: %w", err) + } + + lr, err := config.NewReader2(readers[0]) + if err != nil { + return nil, fmt.Errorf("lzma2: error creating reader: %w", err) + } + + return &readCloser{ + c: readers[0], + r: lr, + }, nil +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/pool/pool.go b/vendor/github.com/bodgit/sevenzip/internal/pool/pool.go new file mode 100644 index 0000000000..bd7194deac --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/pool/pool.go @@ -0,0 +1,138 @@ +// Package pool implements the reader pooling. 
+package pool + +import ( + "container/list" + "runtime" + "sort" + "sync" + + "github.com/bodgit/sevenzip/internal/util" +) + +// Pooler is the interface implemented by a pool. +type Pooler interface { + Get(offset int64) (util.SizeReadSeekCloser, bool) + Put(offset int64, rc util.SizeReadSeekCloser) (bool, error) +} + +// Constructor is the function prototype used to instantiate a pool. +type Constructor func() (Pooler, error) + +type noopPool struct{} + +// NewNoopPool returns a Pooler that doesn't actually pool anything. +func NewNoopPool() (Pooler, error) { + return new(noopPool), nil +} + +func (noopPool) Get(_ int64) (util.SizeReadSeekCloser, bool) { + return nil, false +} + +func (noopPool) Put(_ int64, rc util.SizeReadSeekCloser) (bool, error) { + return false, rc.Close() //nolint:wrapcheck +} + +type pool struct { + mutex sync.Mutex + size int + evictList *list.List + items map[int64]*list.Element +} + +type entry struct { + key int64 + value util.SizeReadSeekCloser +} + +// NewPool returns a Pooler that uses a LRU strategy to maintain a fixed pool +// of util.SizeReadSeekCloser's keyed by their stream offset. +func NewPool() (Pooler, error) { + return &pool{ + size: runtime.NumCPU(), + evictList: list.New(), + items: make(map[int64]*list.Element), + }, nil +} + +func (p *pool) Get(offset int64) (util.SizeReadSeekCloser, bool) { + p.mutex.Lock() + defer p.mutex.Unlock() + + if ent, ok := p.items[offset]; ok { + _ = p.removeElement(ent, false) + + return ent.Value.(*entry).value, true //nolint:forcetypeassert + } + + // Sort keys in descending order + keys := p.keys() + sort.Slice(keys, func(i, j int) bool { return keys[i] > keys[j] }) + + for _, k := range keys { + // First key less than offset is the closest + if k < offset { + ent := p.items[k] + _ = p.removeElement(ent, false) + + return ent.Value.(*entry).value, true //nolint:forcetypeassert + } + } + + return nil, false +} + +func (p *pool) Put(offset int64, rc util.SizeReadSeekCloser) (bool, error) { + p.mutex.Lock() + defer p.mutex.Unlock() + + if _, ok := p.items[offset]; ok { + return false, nil + } + + ent := &entry{offset, rc} + entry := p.evictList.PushFront(ent) + p.items[offset] = entry + + var err error + + evict := p.evictList.Len() > p.size + if evict { + err = p.removeOldest() + } + + return evict, err +} + +func (p *pool) keys() []int64 { + keys := make([]int64, len(p.items)) + i := 0 + + for ent := p.evictList.Back(); ent != nil; ent = ent.Prev() { + keys[i] = ent.Value.(*entry).key //nolint:forcetypeassert + i++ + } + + return keys +} + +func (p *pool) removeOldest() error { + if ent := p.evictList.Back(); ent != nil { + return p.removeElement(ent, true) + } + + return nil +} + +func (p *pool) removeElement(e *list.Element, cb bool) error { + p.evictList.Remove(e) + kv := e.Value.(*entry) //nolint:forcetypeassert + delete(p.items, kv.key) + + if cb { + return kv.value.Close() //nolint:wrapcheck + } + + return nil +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/util/checksum.go b/vendor/github.com/bodgit/sevenzip/internal/util/checksum.go new file mode 100644 index 0000000000..978380cba5 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/util/checksum.go @@ -0,0 +1,8 @@ +package util + +import "bytes" + +// CRC32Equal compares CRC32 checksums. 
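The shift-and-mask expression in the function below simply spells out the big-endian encoding of c, matching the byte order that hash/crc32's Sum method appends. An equivalent formulation, assuming Go 1.19+ for binary.BigEndian.AppendUint32:

package util

import (
	"bytes"
	"encoding/binary"
)

// crc32Equal reports whether the big-endian digest b encodes the checksum c.
func crc32Equal(b []byte, c uint32) bool {
	return bytes.Equal(b, binary.BigEndian.AppendUint32(nil, c))
}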
+func CRC32Equal(b []byte, c uint32) bool { + return bytes.Equal(b, []byte{byte(0xff & (c >> 24)), byte(0xff & (c >> 16)), byte(0xff & (c >> 8)), byte(0xff & c)}) +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/util/reader.go b/vendor/github.com/bodgit/sevenzip/internal/util/reader.go new file mode 100644 index 0000000000..21d2c94946 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/util/reader.go @@ -0,0 +1,68 @@ +// Package util implements various utility types and interfaces. +package util + +import "io" + +// SizeReadSeekCloser is an io.Reader, io.Seeker, and io.Closer with a Size +// method. +type SizeReadSeekCloser interface { + io.Reader + io.Seeker + io.Closer + Size() int64 +} + +// Reader is both an io.Reader and io.ByteReader. +type Reader interface { + io.Reader + io.ByteReader +} + +// ReadCloser is a Reader that is also an io.Closer. +type ReadCloser interface { + Reader + io.Closer +} + +type nopCloser struct { + Reader +} + +func (nopCloser) Close() error { + return nil +} + +// NopCloser returns a ReadCloser with a no-op Close method wrapping the +// provided Reader r. +func NopCloser(r Reader) ReadCloser { + return &nopCloser{r} +} + +type byteReadCloser struct { + io.ReadCloser +} + +func (rc *byteReadCloser) ReadByte() (byte, error) { + var b [1]byte + + n, err := rc.Read(b[:]) + if err != nil { + return 0, err //nolint:wrapcheck + } + + if n == 0 { + return 0, io.ErrNoProgress + } + + return b[0], nil +} + +// ByteReadCloser returns a ReadCloser either by returning the io.ReadCloser +// r if it implements the interface, or wrapping it with a ReadByte method. +func ByteReadCloser(r io.ReadCloser) ReadCloser { + if rc, ok := r.(ReadCloser); ok { + return rc + } + + return &byteReadCloser{r} +} diff --git a/vendor/github.com/bodgit/sevenzip/internal/zstd/reader.go b/vendor/github.com/bodgit/sevenzip/internal/zstd/reader.go new file mode 100644 index 0000000000..6817f1a373 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/internal/zstd/reader.go @@ -0,0 +1,80 @@ +// Package zstd implements the Zstandard decompressor. +package zstd + +import ( + "errors" + "fmt" + "io" + "runtime" + "sync" + + "github.com/klauspost/compress/zstd" +) + +type readCloser struct { + c io.Closer + r *zstd.Decoder +} + +var ( + //nolint:gochecknoglobals + zstdReaderPool sync.Pool + + errAlreadyClosed = errors.New("zstd: already closed") + errNeedOneReader = errors.New("zstd: need exactly one reader") +) + +func (rc *readCloser) Close() error { + if rc.c == nil { + return errAlreadyClosed + } + + if err := rc.c.Close(); err != nil { + return fmt.Errorf("zstd: error closing: %w", err) + } + + zstdReaderPool.Put(rc.r) + rc.c, rc.r = nil, nil + + return nil +} + +func (rc *readCloser) Read(p []byte) (int, error) { + if rc.r == nil { + return 0, errAlreadyClosed + } + + n, err := rc.r.Read(p) + if err != nil && !errors.Is(err, io.EOF) { + err = fmt.Errorf("zstd: error reading: %w", err) + } + + return n, err +} + +// NewReader returns a new Zstandard io.ReadCloser. 
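The constructor below repeats the pooling pattern used throughout these internal packages: try to Reset a pooled decoder, otherwise create a fresh one, and, because a zstd.Decoder owns background goroutines, attach a finalizer so an unclosed decoder is still reclaimed. The pattern condensed to a sketch (package and function names are made up):

package zstdpool

import (
	"io"
	"runtime"
	"sync"

	"github.com/klauspost/compress/zstd"
)

var decoders sync.Pool // holds *zstd.Decoder values

// acquire reuses a pooled decoder when possible, creating one otherwise.
func acquire(r io.Reader) (*zstd.Decoder, error) {
	if d, ok := decoders.Get().(*zstd.Decoder); ok {
		if err := d.Reset(r); err != nil {
			return nil, err
		}

		return d, nil
	}

	d, err := zstd.NewReader(r)
	if err != nil {
		return nil, err
	}

	// Close stops the decoder's background goroutines; the finalizer covers
	// a pooled decoder that is garbage collected without ever being closed.
	runtime.SetFinalizer(d, (*zstd.Decoder).Close)

	return d, nil
}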
+func NewReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + + var err error + + r, ok := zstdReaderPool.Get().(*zstd.Decoder) + if ok { + if err = r.Reset(readers[0]); err != nil { + return nil, fmt.Errorf("zstd: error resetting: %w", err) + } + } else { + if r, err = zstd.NewReader(readers[0]); err != nil { + return nil, fmt.Errorf("zstd: error creating reader: %w", err) + } + + runtime.SetFinalizer(r, (*zstd.Decoder).Close) + } + + return &readCloser{ + c: readers[0], + r: r, + }, nil +} diff --git a/vendor/github.com/bodgit/sevenzip/reader.go b/vendor/github.com/bodgit/sevenzip/reader.go new file mode 100644 index 0000000000..635f5c2034 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/reader.go @@ -0,0 +1,828 @@ +// Package sevenzip provides read access to 7-zip archives. +package sevenzip + +import ( + "bufio" + "bytes" + "encoding/binary" + "errors" + "fmt" + "hash/crc32" + "io" + "io/fs" + "os" + "path" + "path/filepath" + "sort" + "strings" + "sync" + "time" + + "github.com/bodgit/plumbing" + "github.com/bodgit/sevenzip/internal/pool" + "github.com/bodgit/sevenzip/internal/util" + "github.com/hashicorp/go-multierror" + "go4.org/readerutil" +) + +var ( + errFormat = errors.New("sevenzip: not a valid 7-zip file") + errChecksum = errors.New("sevenzip: checksum error") + errTooMuch = errors.New("sevenzip: too much data") + errNegativeSize = errors.New("sevenzip: size cannot be negative") + errOneHeaderStream = errors.New("sevenzip: expected only one folder in header stream") +) + +// ReadError is used to wrap read I/O errors. +type ReadError struct { + // Encrypted is a hint that there is encryption involved. + Encrypted bool + Err error +} + +func (e ReadError) Error() string { + return fmt.Sprintf("sevenzip: read error: %v", e.Err) +} + +func (e ReadError) Unwrap() error { + return e.Err +} + +// A Reader serves content from a 7-Zip archive. +type Reader struct { + r io.ReaderAt + start int64 + end int64 + si *streamsInfo + p string + File []*File + pool []pool.Pooler + + fileListOnce sync.Once + fileList []fileListEntry +} + +// A ReadCloser is a [Reader] that must be closed when no longer needed. +type ReadCloser struct { + f []*os.File + Reader +} + +// A File is a single file in a 7-Zip archive. The file information is in the +// embedded [FileHeader]. The file content can be accessed by calling +// [File.Open]. 
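In practice these types combine as in the sketch below (archive path assumed). Because every File records the Stream it belongs to, reading files grouped by Stream and in offset order lets the per-folder reader pool resume decompression rather than restart it:

package main

import (
	"io"
	"log"

	"github.com/bodgit/sevenzip"
)

func main() {
	r, err := sevenzip.OpenReader("archive.7z") // hypothetical path
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()

	for _, f := range r.File {
		rc, err := f.Open()
		if err != nil {
			log.Fatal(err)
		}

		// Extract or inspect the contents here.
		if _, err := io.Copy(io.Discard, rc); err != nil {
			log.Fatal(err)
		}

		rc.Close()
	}
}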
+type File struct { + FileHeader + zip *Reader + folder int + offset int64 +} + +type fileReader struct { + rc util.SizeReadSeekCloser + f *File + n int64 +} + +func (fr *fileReader) Stat() (fs.FileInfo, error) { + return headerFileInfo{&fr.f.FileHeader}, nil +} + +func (fr *fileReader) Read(p []byte) (int, error) { + if len(p) == 0 { + return 0, nil + } + + if fr.n <= 0 { + return 0, io.EOF + } + + if int64(len(p)) > fr.n { + p = p[0:fr.n] + } + + n, err := fr.rc.Read(p) + fr.n -= int64(n) + + if err != nil && !errors.Is(err, io.EOF) { + e := &ReadError{ + Err: err, + } + + if frc, ok := fr.rc.(*folderReadCloser); ok { + e.Encrypted = frc.hasEncryption + } + + return n, e + } + + return n, err //nolint:wrapcheck +} + +func (fr *fileReader) Close() error { + if fr.rc == nil { + return nil + } + + offset, err := fr.rc.Seek(0, io.SeekCurrent) + if err != nil { + return fmt.Errorf("sevenzip: error seeking current position: %w", err) + } + + if offset == fr.rc.Size() { // EOF reached + if err := fr.rc.Close(); err != nil { + return fmt.Errorf("sevenzip: error closing: %w", err) + } + } else { + f := fr.f + if _, err := f.zip.pool[f.folder].Put(offset, fr.rc); err != nil { + return fmt.Errorf("sevenzip: error adding to pool: %w", err) + } + } + + fr.rc = nil + + return nil +} + +// Open returns an [io.ReadCloser] that provides access to the [File]'s +// contents. Multiple files may be read concurrently. +func (f *File) Open() (io.ReadCloser, error) { + if f.FileHeader.isEmptyStream || f.FileHeader.isEmptyFile { + // Return empty reader for directory or empty file + return io.NopCloser(bytes.NewReader(nil)), nil + } + + rc, _ := f.zip.pool[f.folder].Get(f.offset) + if rc == nil { + var ( + encrypted bool + err error + ) + + rc, _, encrypted, err = f.zip.folderReader(f.zip.si, f.folder) + if err != nil { + return nil, &ReadError{ + Encrypted: encrypted, + Err: err, + } + } + } + + if _, err := rc.Seek(f.offset, io.SeekStart); err != nil { + e := &ReadError{ + Err: err, + } + + if fr, ok := rc.(*folderReadCloser); ok { + e.Encrypted = fr.hasEncryption + } + + return nil, e + } + + return &fileReader{ + rc: rc, + f: f, + n: int64(f.UncompressedSize), //nolint:gosec + }, nil +} + +// OpenReaderWithPassword will open the 7-zip file specified by name using +// password as the basis of the decryption key and return a [*ReadCloser]. If +// name has a ".001" suffix it is assumed there are multiple volumes and each +// sequential volume will be opened. 
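One call therefore covers a split, encrypted archive; file names and password below are invented for illustration:

r, err := sevenzip.OpenReaderWithPassword("backup.7z.001", "s3cret") // also opens backup.7z.002, ...
if err != nil {
	log.Fatal(err)
}
defer r.Close()

fmt.Println(r.Volumes()) // every volume file that was opened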
+// +//nolint:cyclop,funlen +func OpenReaderWithPassword(name, password string) (*ReadCloser, error) { + f, err := os.Open(filepath.Clean(name)) + if err != nil { + return nil, fmt.Errorf("sevenzip: error opening: %w", err) + } + + info, err := f.Stat() + if err != nil { + err = multierror.Append(err, f.Close()) + + return nil, fmt.Errorf("sevenzip: error retrieving file info: %w", err) + } + + var reader io.ReaderAt = f + + size := info.Size() + files := []*os.File{f} + + if ext := filepath.Ext(name); ext == ".001" { + sr := []readerutil.SizeReaderAt{io.NewSectionReader(f, 0, size)} + + for i := 2; true; i++ { + f, err := os.Open(fmt.Sprintf("%s.%03d", strings.TrimSuffix(name, ext), i)) + if err != nil { + if errors.Is(err, fs.ErrNotExist) { + break + } + + for _, file := range files { + err = multierror.Append(err, file.Close()) + } + + return nil, fmt.Errorf("sevenzip: error opening: %w", err) + } + + files = append(files, f) + + info, err = f.Stat() + if err != nil { + for _, file := range files { + err = multierror.Append(err, file.Close()) + } + + return nil, fmt.Errorf("sevenzip: error retrieving file info: %w", err) + } + + sr = append(sr, io.NewSectionReader(f, 0, info.Size())) + } + + mr := readerutil.NewMultiReaderAt(sr...) + reader, size = mr, mr.Size() + } + + r := new(ReadCloser) + r.p = password + + if err := r.init(reader, size); err != nil { + for _, file := range files { + err = multierror.Append(err, file.Close()) + } + + return nil, fmt.Errorf("sevenzip: error initialising: %w", err) + } + + r.f = files + + return r, nil +} + +// OpenReader will open the 7-zip file specified by name and return a +// [*ReadCloser]. If name has a ".001" suffix it is assumed there are multiple +// volumes and each sequential volume will be opened. +func OpenReader(name string) (*ReadCloser, error) { + return OpenReaderWithPassword(name, "") +} + +// NewReaderWithPassword returns a new [*Reader] reading from r using password +// as the basis of the decryption key, which is assumed to have the given size +// in bytes. +func NewReaderWithPassword(r io.ReaderAt, size int64, password string) (*Reader, error) { + if size < 0 { + return nil, errNegativeSize + } + + zr := new(Reader) + zr.p = password + + if err := zr.init(r, size); err != nil { + return nil, err + } + + return zr, nil +} + +// NewReader returns a new [*Reader] reading from r, which is assumed to have +// the given size in bytes. 
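Since any io.ReaderAt works, an archive already held in memory can be read without touching the filesystem; a sketch with an assumed source of bytes:

data, err := os.ReadFile("archive.7z") // any source of bytes will do
if err != nil {
	log.Fatal(err)
}

zr, err := sevenzip.NewReader(bytes.NewReader(data), int64(len(data)))
if err != nil {
	log.Fatal(err)
}

fmt.Println(len(zr.File), "files")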
+func NewReader(r io.ReaderAt, size int64) (*Reader, error) { + return NewReaderWithPassword(r, size, "") +} + +func (z *Reader) folderReader(si *streamsInfo, f int) (*folderReadCloser, uint32, bool, error) { + // Create a SectionReader covering all of the streams data + return si.FolderReader(io.NewSectionReader(z.r, z.start, z.end-z.start), f, z.p) +} + +const ( + chunkSize = 4096 + searchLimit = 1 << 20 // 1 MiB +) + +func findSignature(r io.ReaderAt, search []byte) ([]int64, error) { + chunk := make([]byte, chunkSize+len(search)) + offsets := make([]int64, 0, 2) + + for offset := int64(0); offset < searchLimit; offset += chunkSize { + n, err := r.ReadAt(chunk, offset) + + for i := 0; ; { + idx := bytes.Index(chunk[i:n], search) + if idx == -1 { + break + } + + offsets = append(offsets, offset+int64(i+idx)) + if offsets[0] == 0 { + // If signature is at the beginning, return immediately, it's a regular archive + return offsets, nil + } + + i += idx + 1 + } + + if err != nil { + if errors.Is(err, io.EOF) { + break + } + + return nil, fmt.Errorf("sevenzip: error reading chunk: %w", err) + } + } + + return offsets, nil +} + +//nolint:cyclop,funlen,gocognit,gocyclo,maintidx +func (z *Reader) init(r io.ReaderAt, size int64) (err error) { + h := crc32.NewIEEE() + tra := plumbing.TeeReaderAt(r, h) + + var ( + signature = []byte{'7', 'z', 0xbc, 0xaf, 0x27, 0x1c} + offsets []int64 + ) + + offsets, err = findSignature(r, signature) + if err != nil { + return err + } + + if len(offsets) == 0 { + return errFormat + } + + var ( + sr *io.SectionReader + off int64 + start startHeader + ) + + for _, off = range offsets { + sr = io.NewSectionReader(tra, off, size-off) // Will only read first 32 bytes + + var sh signatureHeader + if err = binary.Read(sr, binary.LittleEndian, &sh); err != nil { + return fmt.Errorf("sevenzip: error reading signature header: %w", err) + } + + z.r = r + + h.Reset() + + if err = binary.Read(sr, binary.LittleEndian, &start); err != nil { + return fmt.Errorf("sevenzip: error reading start header: %w", err) + } + + // CRC of the start header should match + if util.CRC32Equal(h.Sum(nil), sh.CRC) { + break + } + + err = errChecksum + } + + if err != nil { + return err + } + + // Work out where we are in the file (32, avoiding magic numbers) + if z.start, err = sr.Seek(0, io.SeekCurrent); err != nil { + return fmt.Errorf("sevenzip: error seeking current position: %w", err) + } + + // Seek over the streams + if z.end, err = sr.Seek(int64(start.Offset), io.SeekCurrent); err != nil { //nolint:gosec + return fmt.Errorf("sevenzip: error seeking over streams: %w", err) + } + + z.start += off + z.end += off + + h.Reset() + + // Bound bufio.Reader otherwise it can read trailing garbage which screws up the CRC check + br := bufio.NewReader(io.NewSectionReader(tra, z.end, int64(start.Size))) //nolint:gosec + + var ( + id byte + header *header + streamsInfo *streamsInfo + ) + + if id, err = br.ReadByte(); err != nil { + return fmt.Errorf("sevenzip: error reading header id: %w", err) + } + + switch id { + case idHeader: + if header, err = readHeader(br); err != nil { + return err + } + case idEncodedHeader: + if streamsInfo, err = readStreamsInfo(br); err != nil { + return err + } + default: + return errUnexpectedID + } + + // If there's more data to read, we've not parsed this correctly. 
This + // won't break with trailing data as the bufio.Reader was bounded + if n, _ := io.CopyN(io.Discard, br, 1); n != 0 { + return errTooMuch + } + + // CRC should match the one from the start header + if !util.CRC32Equal(h.Sum(nil), start.CRC) { + return errChecksum + } + + // If the header was encoded we should have sufficient information now + // to decode it + if streamsInfo != nil { + if streamsInfo.Folders() != 1 { + return errOneHeaderStream + } + + var ( + fr *folderReadCloser + crc uint32 + encrypted bool + ) + + fr, crc, encrypted, err = z.folderReader(streamsInfo, 0) + if err != nil { + return &ReadError{ + Encrypted: encrypted, + Err: err, + } + } + + defer func() { + err = multierror.Append(err, fr.Close()).ErrorOrNil() + }() + + if header, err = readEncodedHeader(util.ByteReadCloser(fr)); err != nil { + return &ReadError{ + Encrypted: fr.hasEncryption, + Err: err, + } + } + + if crc != 0 && !util.CRC32Equal(fr.Checksum(), crc) { + return errChecksum + } + } + + z.si = header.streamsInfo + + // spew.Dump(header) + filesPerStream := make(map[int]int, z.si.Folders()) + + if header.filesInfo != nil { + folder, offset := 0, int64(0) + z.File = make([]*File, 0, len(header.filesInfo.file)) + j := 0 + + for _, fh := range header.filesInfo.file { + f := new(File) + f.zip = z + f.FileHeader = fh + + if f.FileHeader.FileInfo().IsDir() && !strings.HasSuffix(f.FileHeader.Name, "/") { + f.FileHeader.Name += "/" + } + + if !fh.isEmptyStream && !fh.isEmptyFile { + f.folder, _ = header.streamsInfo.FileFolderAndSize(j) + + // Make an exported copy of the folder index + f.Stream = f.folder + + filesPerStream[f.folder]++ + + if f.folder != folder { + offset = 0 + } + + f.offset = offset + offset += int64(f.UncompressedSize) //nolint:gosec + folder = f.folder + j++ + } + + z.File = append(z.File, f) + } + } + + // spew.Dump(filesPerStream) + + z.pool = make([]pool.Pooler, z.si.Folders()) + for i := range z.pool { + var newPool pool.Constructor = pool.NewNoopPool + + if filesPerStream[i] > 1 { + newPool = pool.NewPool + } + + if z.pool[i], err = newPool(); err != nil { + return err + } + } + + return nil +} + +// Volumes returns the list of volumes that have been opened as part of the +// current archive. +func (rc *ReadCloser) Volumes() []string { + volumes := make([]string, len(rc.f)) + for idx, f := range rc.f { + volumes[idx] = f.Name() + } + + return volumes +} + +// Close closes the 7-zip file or volumes, rendering them unusable for I/O. 
+func (rc *ReadCloser) Close() (err error) { + for _, f := range rc.f { + err = multierror.Append(err, f.Close()).ErrorOrNil() + } + + if err != nil { + err = fmt.Errorf("sevenzip: error closing: %w", err) + } + + return err +} + +type fileListEntry struct { + name string + file *File + isDir bool + isDup bool +} + +type fileInfoDirEntry interface { + fs.FileInfo + fs.DirEntry +} + +func (e *fileListEntry) stat() (fileInfoDirEntry, error) { + if e.isDup { + return nil, errors.New(e.name + ": duplicate entries in 7-zip file") //nolint:err113 + } + + if !e.isDir { + return headerFileInfo{&e.file.FileHeader}, nil + } + + return e, nil +} + +func (e *fileListEntry) Name() string { + _, elem := split(e.name) + + return elem +} + +func (e *fileListEntry) Size() int64 { return 0 } +func (e *fileListEntry) Mode() fs.FileMode { return fs.ModeDir | 0o555 } +func (e *fileListEntry) Type() fs.FileMode { return fs.ModeDir } +func (e *fileListEntry) IsDir() bool { return true } +func (e *fileListEntry) Sys() interface{} { return nil } + +func (e *fileListEntry) ModTime() time.Time { + if e.file == nil { + return time.Time{} + } + + return e.file.FileHeader.Modified.UTC() +} + +func (e *fileListEntry) Info() (fs.FileInfo, error) { return e, nil } + +func toValidName(name string) string { + name = strings.ReplaceAll(name, `\`, `/`) + + p := strings.TrimPrefix(path.Clean(name), "/") + + for strings.HasPrefix(p, "../") { + p = p[len("../"):] + } + + return p +} + +//nolint:cyclop,funlen +func (z *Reader) initFileList() { + z.fileListOnce.Do(func() { + files := make(map[string]int) + knownDirs := make(map[string]int) + + dirs := make(map[string]struct{}) + + for _, file := range z.File { + isDir := len(file.Name) > 0 && file.Name[len(file.Name)-1] == '/' + + name := toValidName(file.Name) + if name == "" { + continue + } + + if idx, ok := files[name]; ok { + z.fileList[idx].isDup = true + + continue + } + + if idx, ok := knownDirs[name]; ok { + z.fileList[idx].isDup = true + + continue + } + + for dir := path.Dir(name); dir != "."; dir = path.Dir(dir) { + dirs[dir] = struct{}{} + } + + idx := len(z.fileList) + entry := fileListEntry{ + name: name, + file: file, + isDir: isDir, + } + z.fileList = append(z.fileList, entry) + + if isDir { + knownDirs[name] = idx + } else { + files[name] = idx + } + } + + for dir := range dirs { + if _, ok := knownDirs[dir]; !ok { + if idx, ok := files[dir]; ok { + z.fileList[idx].isDup = true + } else { + entry := fileListEntry{ + name: dir, + file: nil, + isDir: true, + } + z.fileList = append(z.fileList, entry) + } + } + } + + sort.Slice(z.fileList, func(i, j int) bool { return fileEntryLess(z.fileList[i].name, z.fileList[j].name) }) + }) +} + +func fileEntryLess(x, y string) bool { + xdir, xelem := split(x) + ydir, yelem := split(y) + + return xdir < ydir || xdir == ydir && xelem < yelem +} + +// Open opens the named file in the 7-zip archive, using the semantics of +// [fs.FS.Open]: paths are always slash separated, with no leading / or ../ +// elements. 
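Because Open satisfies fs.FS, the standard io/fs helpers apply to an archive directly, for example walking every entry (archive path assumed):

r, err := sevenzip.OpenReader("archive.7z")
if err != nil {
	log.Fatal(err)
}
defer r.Close()

err = fs.WalkDir(r, ".", func(name string, d fs.DirEntry, err error) error {
	if err != nil {
		return err
	}

	fmt.Println(name, d.IsDir())

	return nil
})
if err != nil {
	log.Fatal(err)
}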
+func (z *Reader) Open(name string) (fs.File, error) { + z.initFileList() + + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrInvalid} + } + + e := z.openLookup(name) + if e == nil { + return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrNotExist} + } + + if e.isDir { + return &openDir{e, z.openReadDir(name), 0}, nil + } + + rc, err := e.file.Open() + if err != nil { + return nil, err + } + + return rc.(fs.File), nil //nolint:forcetypeassert +} + +func split(name string) (dir, elem string) { + if len(name) > 0 && name[len(name)-1] == '/' { + name = name[:len(name)-1] + } + + i := len(name) - 1 + for i >= 0 && name[i] != '/' { + i-- + } + + if i < 0 { + return ".", name + } + + return name[:i], name[i+1:] +} + +//nolint:gochecknoglobals +var dotFile = &fileListEntry{name: "./", isDir: true} + +func (z *Reader) openLookup(name string) *fileListEntry { + if name == "." { + return dotFile + } + + dir, elem := split(name) + + files := z.fileList + i := sort.Search(len(files), func(i int) bool { + idir, ielem := split(files[i].name) + + return idir > dir || idir == dir && ielem >= elem + }) + + if i < len(files) { + fname := files[i].name + if fname == name || len(fname) == len(name)+1 && fname[len(name)] == '/' && fname[:len(name)] == name { + return &files[i] + } + } + + return nil +} + +func (z *Reader) openReadDir(dir string) []fileListEntry { + files := z.fileList + + i := sort.Search(len(files), func(i int) bool { + idir, _ := split(files[i].name) + + return idir >= dir + }) + + j := sort.Search(len(files), func(j int) bool { + jdir, _ := split(files[j].name) + + return jdir > dir + }) + + return files[i:j] +} + +type openDir struct { + e *fileListEntry + files []fileListEntry + offset int +} + +func (d *openDir) Close() error { return nil } +func (d *openDir) Stat() (fs.FileInfo, error) { return d.e.stat() } + +var errIsDirectory = errors.New("is a directory") + +func (d *openDir) Read([]byte) (int, error) { + return 0, &fs.PathError{Op: "read", Path: d.e.name, Err: errIsDirectory} +} + +func (d *openDir) ReadDir(count int) ([]fs.DirEntry, error) { + n := len(d.files) - d.offset + if count > 0 && n > count { + n = count + } + + if n == 0 { + if count <= 0 { + return nil, nil + } + + return nil, io.EOF + } + + list := make([]fs.DirEntry, n) + for i := range list { + s, err := d.files[d.offset+i].stat() + if err != nil { + return nil, err + } + + list[i] = s + } + + d.offset += n + + return list, nil +} diff --git a/vendor/github.com/bodgit/sevenzip/register.go b/vendor/github.com/bodgit/sevenzip/register.go new file mode 100644 index 0000000000..4cb8abc5b2 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/register.go @@ -0,0 +1,94 @@ +package sevenzip + +import ( + "errors" + "io" + "sync" + + "github.com/bodgit/sevenzip/internal/aes7z" + "github.com/bodgit/sevenzip/internal/bcj2" + "github.com/bodgit/sevenzip/internal/bra" + "github.com/bodgit/sevenzip/internal/brotli" + "github.com/bodgit/sevenzip/internal/bzip2" + "github.com/bodgit/sevenzip/internal/deflate" + "github.com/bodgit/sevenzip/internal/delta" + "github.com/bodgit/sevenzip/internal/lz4" + "github.com/bodgit/sevenzip/internal/lzma" + "github.com/bodgit/sevenzip/internal/lzma2" + "github.com/bodgit/sevenzip/internal/zstd" +) + +// Decompressor describes the function signature that decompression/decryption +// methods must implement to return a new 
instance of themselves. They are +// passed any property bytes, the size of the stream and a slice of at least +// one io.ReadCloser providing the stream(s) of bytes. +type Decompressor func([]byte, uint64, []io.ReadCloser) (io.ReadCloser, error) + +var ( + //nolint:gochecknoglobals + decompressors sync.Map + + errNeedOneReader = errors.New("copy: need exactly one reader") +) + +func newCopyReader(_ []byte, _ uint64, readers []io.ReadCloser) (io.ReadCloser, error) { + if len(readers) != 1 { + return nil, errNeedOneReader + } + // just return the passed io.ReadCloser + return readers[0], nil +} + +//nolint:gochecknoinits +func init() { + // Copy + RegisterDecompressor([]byte{0x00}, Decompressor(newCopyReader)) + // Delta + RegisterDecompressor([]byte{0x03}, Decompressor(delta.NewReader)) + // LZMA + RegisterDecompressor([]byte{0x03, 0x01, 0x01}, Decompressor(lzma.NewReader)) + // BCJ + RegisterDecompressor([]byte{0x03, 0x03, 0x01, 0x03}, Decompressor(bra.NewBCJReader)) + // BCJ2 + RegisterDecompressor([]byte{0x03, 0x03, 0x01, 0x1b}, Decompressor(bcj2.NewReader)) + // PPC + RegisterDecompressor([]byte{0x03, 0x03, 0x02, 0x05}, Decompressor(bra.NewPPCReader)) + // ARM + RegisterDecompressor([]byte{0x03, 0x03, 0x05, 0x01}, Decompressor(bra.NewARMReader)) + // SPARC + RegisterDecompressor([]byte{0x03, 0x03, 0x08, 0x05}, Decompressor(bra.NewSPARCReader)) + // Deflate + RegisterDecompressor([]byte{0x04, 0x01, 0x08}, Decompressor(deflate.NewReader)) + // Bzip2 + RegisterDecompressor([]byte{0x04, 0x02, 0x02}, Decompressor(bzip2.NewReader)) + // Zstandard + RegisterDecompressor([]byte{0x04, 0xf7, 0x11, 0x01}, Decompressor(zstd.NewReader)) + // Brotli + RegisterDecompressor([]byte{0x04, 0xf7, 0x11, 0x02}, Decompressor(brotli.NewReader)) + // LZ4 + RegisterDecompressor([]byte{0x04, 0xf7, 0x11, 0x04}, Decompressor(lz4.NewReader)) + // AES-CBC-256 & SHA-256 + RegisterDecompressor([]byte{0x06, 0xf1, 0x07, 0x01}, Decompressor(aes7z.NewReader)) + // LZMA2 + RegisterDecompressor([]byte{0x21}, Decompressor(lzma2.NewReader)) +} + +// RegisterDecompressor allows custom decompressors for a specified method ID.
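RegisterDecompressor (below) panics on a duplicate method ID, so it is intended for init-time wiring of methods this package does not handle itself. A hypothetical registration for Deflate64, whose 7-zip method ID is 04 01 09; newDeflate64Reader is an assumed constructor, not part of this package:

func init() {
	sevenzip.RegisterDecompressor(
		[]byte{0x04, 0x01, 0x09}, // Deflate64 method ID
		func(props []byte, size uint64, readers []io.ReadCloser) (io.ReadCloser, error) {
			if len(readers) != 1 {
				return nil, errors.New("deflate64: need exactly one reader")
			}

			return newDeflate64Reader(props, size, readers[0]) // hypothetical
		},
	)
}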
+func RegisterDecompressor(method []byte, dcomp Decompressor) { + if _, dup := decompressors.LoadOrStore(string(method), dcomp); dup { + panic("decompressor already registered") + } +} + +func decompressor(method []byte) Decompressor { + di, ok := decompressors.Load(string(method)) + if !ok { + return nil + } + + if d, ok := di.(Decompressor); ok { + return d + } + + return nil +} diff --git a/vendor/github.com/bodgit/sevenzip/release-please-config.json b/vendor/github.com/bodgit/sevenzip/release-please-config.json new file mode 100644 index 0000000000..cb96721271 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/release-please-config.json @@ -0,0 +1,6 @@ +{ + "packages": { + ".": {} + }, + "release-type": "go" +} diff --git a/vendor/github.com/bodgit/sevenzip/struct.go b/vendor/github.com/bodgit/sevenzip/struct.go new file mode 100644 index 0000000000..1ccebdfc93 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/struct.go @@ -0,0 +1,458 @@ +package sevenzip + +import ( + "bufio" + "errors" + "fmt" + "hash" + "hash/crc32" + "io" + "io/fs" + "path" + "time" + + "github.com/bodgit/plumbing" + "github.com/bodgit/sevenzip/internal/util" +) + +var ( + errAlgorithm = errors.New("sevenzip: unsupported compression algorithm") + errInvalidWhence = errors.New("invalid whence") + errNegativeSeek = errors.New("negative seek") + errSeekBackwards = errors.New("cannot seek backwards") + errSeekEOF = errors.New("cannot seek beyond EOF") + errMultipleOutputStreams = errors.New("more than one output stream") + errNoBoundStream = errors.New("cannot find bound stream") + errNoUnboundStream = errors.New("expecting one unbound output stream") +) + +// CryptoReadCloser adds a Password method to decompressors. 
+type CryptoReadCloser interface { + Password(password string) error +} + +type signatureHeader struct { + Signature [6]byte + Major byte + Minor byte + CRC uint32 +} + +type startHeader struct { + Offset uint64 + Size uint64 + CRC uint32 +} + +type packInfo struct { + position uint64 + streams uint64 + size []uint64 + digest []uint32 +} + +type coder struct { + id []byte + in, out uint64 + properties []byte +} + +type bindPair struct { + in, out uint64 +} + +type folder struct { + in, out uint64 + packedStreams uint64 + coder []*coder + bindPair []*bindPair + size []uint64 + packed []uint64 +} + +func (f *folder) findInBindPair(i uint64) *bindPair { + for _, v := range f.bindPair { + if v.in == i { + return v + } + } + + return nil +} + +func (f *folder) findOutBindPair(i uint64) *bindPair { + for _, v := range f.bindPair { + if v.out == i { + return v + } + } + + return nil +} + +func (f *folder) coderReader(readers []io.ReadCloser, coder uint64, password string) (io.ReadCloser, bool, error) { + dcomp := decompressor(f.coder[coder].id) + if dcomp == nil { + return nil, false, errAlgorithm + } + + cr, err := dcomp(f.coder[coder].properties, f.size[coder], readers) + if err != nil { + return nil, false, err + } + + crc, ok := cr.(CryptoReadCloser) + if ok { + if err = crc.Password(password); err != nil { + return nil, true, fmt.Errorf("sevenzip: error setting password: %w", err) + } + } + + return plumbing.LimitReadCloser(cr, int64(f.size[coder])), ok, nil //nolint:gosec +} + +type folderReadCloser struct { + io.ReadCloser + h hash.Hash + wc *plumbing.WriteCounter + size int64 + hasEncryption bool +} + +func (rc *folderReadCloser) Checksum() []byte { + return rc.h.Sum(nil) +} + +func (rc *folderReadCloser) Seek(offset int64, whence int) (int64, error) { + var newo int64 + + switch whence { + case io.SeekStart: + newo = offset + case io.SeekCurrent: + newo = int64(rc.wc.Count()) + offset //nolint:gosec + case io.SeekEnd: + newo = rc.Size() + offset + default: + return 0, errInvalidWhence + } + + if newo < 0 { + return 0, errNegativeSeek + } + + if uint64(newo) < rc.wc.Count() { + return 0, errSeekBackwards + } + + if newo > rc.Size() { + return 0, errSeekEOF + } + + if _, err := io.CopyN(io.Discard, rc, newo-int64(rc.wc.Count())); err != nil { //nolint:gosec + return 0, fmt.Errorf("sevenzip: error seeking: %w", err) + } + + return newo, nil +} + +func (rc *folderReadCloser) Size() int64 { + return rc.size +} + +func newFolderReadCloser(rc io.ReadCloser, size int64, hasEncryption bool) *folderReadCloser { + nrc := new(folderReadCloser) + nrc.h = crc32.NewIEEE() + nrc.wc = new(plumbing.WriteCounter) + nrc.ReadCloser = plumbing.TeeReadCloser(rc, io.MultiWriter(nrc.h, nrc.wc)) + nrc.size = size + nrc.hasEncryption = hasEncryption + + return nrc +} + +func (f *folder) unpackSize() uint64 { + if len(f.size) == 0 { + return 0 + } + + for i := len(f.size) - 1; i >= 0; i-- { + if f.findOutBindPair(uint64(i)) == nil { + return f.size[i] + } + } + + return f.size[len(f.size)-1] +} + +type unpackInfo struct { + folder []*folder + digest []uint32 +} + +type subStreamsInfo struct { + streams []uint64 + size []uint64 + digest []uint32 +} + +type streamsInfo struct { + packInfo *packInfo + unpackInfo *unpackInfo + subStreamsInfo *subStreamsInfo +} + +func (si *streamsInfo) Folders() int { + if si != nil && si.unpackInfo != nil { + return len(si.unpackInfo.folder) + } + + return 0 +} + +func (si *streamsInfo) FileFolderAndSize(file int) (int, uint64) { + total := uint64(0) + + var ( + folder int + streams 
uint64 = 1 + ) + + if si.subStreamsInfo != nil { + for folder, streams = range si.subStreamsInfo.streams { + total += streams + if uint64(file) < total { //nolint:gosec + break + } + } + } + + if streams == 1 { + return folder, si.unpackInfo.folder[folder].size[len(si.unpackInfo.folder[folder].coder)-1] + } + + return folder, si.subStreamsInfo.size[file] +} + +func (si *streamsInfo) folderOffset(folder int) int64 { + offset := uint64(0) + + for i, k := 0, uint64(0); i < folder; i++ { + for j := k; j < k+si.unpackInfo.folder[i].packedStreams; j++ { + offset += si.packInfo.size[j] + } + + k += si.unpackInfo.folder[i].packedStreams + } + + return int64(si.packInfo.position + offset) //nolint:gosec +} + +//nolint:cyclop,funlen,lll +func (si *streamsInfo) FolderReader(r io.ReaderAt, folder int, password string) (*folderReadCloser, uint32, bool, error) { + f := si.unpackInfo.folder[folder] + in := make([]io.ReadCloser, f.in) + out := make([]io.ReadCloser, f.out) + + packedOffset := 0 + for i := 0; i < folder; i++ { + packedOffset += len(si.unpackInfo.folder[i].packed) + } + + offset := int64(0) + + for i, input := range f.packed { + size := int64(si.packInfo.size[packedOffset+i]) //nolint:gosec + in[input] = util.NopCloser(bufio.NewReader(io.NewSectionReader(r, si.folderOffset(folder)+offset, size))) + offset += size + } + + var ( + hasEncryption bool + input, output uint64 + ) + + for i, c := range f.coder { + if c.out != 1 { + return nil, 0, hasEncryption, errMultipleOutputStreams + } + + for j := input; j < input+c.in; j++ { + if in[j] != nil { + continue + } + + bp := f.findInBindPair(j) + if bp == nil || out[bp.out] == nil { + return nil, 0, hasEncryption, errNoBoundStream + } + + in[j] = out[bp.out] + } + + var ( + isEncrypted bool + err error + ) + + out[output], isEncrypted, err = f.coderReader(in[input:input+c.in], uint64(i), password) //nolint:gosec + if err != nil { + return nil, 0, hasEncryption, err + } + + if isEncrypted { + hasEncryption = true + } + + input += c.in + output += c.out + } + + unbound := make([]uint64, 0, f.out) + + for i := uint64(0); i < f.out; i++ { + if bp := f.findOutBindPair(i); bp == nil { + unbound = append(unbound, i) + } + } + + if len(unbound) != 1 || out[unbound[0]] == nil { + return nil, 0, hasEncryption, errNoUnboundStream + } + + fr := newFolderReadCloser(out[unbound[0]], int64(f.unpackSize()), hasEncryption) //nolint:gosec + + if si.unpackInfo.digest != nil { + return fr, si.unpackInfo.digest[folder], hasEncryption, nil + } + + return fr, 0, hasEncryption, nil +} + +type filesInfo struct { + file []FileHeader +} + +type header struct { + streamsInfo *streamsInfo + filesInfo *filesInfo +} + +// FileHeader describes a file within a 7-zip file. +type FileHeader struct { + Name string + Created time.Time + Accessed time.Time + Modified time.Time + Attributes uint32 + CRC32 uint32 + UncompressedSize uint64 + + // Stream is an opaque identifier representing the compressed stream + // that contains the file. Any File with the same value can be assumed + // to be stored within the same stream. + Stream int + + isEmptyStream bool + isEmptyFile bool +} + +// FileInfo returns an fs.FileInfo for the FileHeader. 
+func (h *FileHeader) FileInfo() fs.FileInfo { + return headerFileInfo{h} +} + +type headerFileInfo struct { + fh *FileHeader +} + +func (fi headerFileInfo) Name() string { return path.Base(fi.fh.Name) } +func (fi headerFileInfo) Size() int64 { return int64(fi.fh.UncompressedSize) } //nolint:gosec +func (fi headerFileInfo) IsDir() bool { return fi.Mode().IsDir() } +func (fi headerFileInfo) ModTime() time.Time { return fi.fh.Modified.UTC() } +func (fi headerFileInfo) Mode() fs.FileMode { return fi.fh.Mode() } +func (fi headerFileInfo) Type() fs.FileMode { return fi.fh.Mode().Type() } +func (fi headerFileInfo) Sys() interface{} { return fi.fh } + +func (fi headerFileInfo) Info() (fs.FileInfo, error) { return fi, nil } + +const ( + // Unix constants. The specification doesn't mention them, + // but these seem to be the values agreed on by tools. + sIFMT = 0xf000 + sIFSOCK = 0xc000 + sIFLNK = 0xa000 + sIFREG = 0x8000 + sIFBLK = 0x6000 + sIFDIR = 0x4000 + sIFCHR = 0x2000 + sIFIFO = 0x1000 + sISUID = 0x800 + sISGID = 0x400 + sISVTX = 0x200 + + msdosDir = 0x10 + msdosReadOnly = 0x01 +) + +// Mode returns the permission and mode bits for the FileHeader. +func (h *FileHeader) Mode() (mode fs.FileMode) { + // Prefer the POSIX attributes if they're present + if h.Attributes&0xf0000000 != 0 { + mode = unixModeToFileMode(h.Attributes >> 16) + } else { + mode = msdosModeToFileMode(h.Attributes) + } + + return +} + +func msdosModeToFileMode(m uint32) (mode fs.FileMode) { + if m&msdosDir != 0 { + mode = fs.ModeDir | 0o777 + } else { + mode = 0o666 + } + + if m&msdosReadOnly != 0 { + mode &^= 0o222 + } + + return mode +} + +//nolint:cyclop +func unixModeToFileMode(m uint32) fs.FileMode { + mode := fs.FileMode(m & 0o777) + + switch m & sIFMT { + case sIFBLK: + mode |= fs.ModeDevice + case sIFCHR: + mode |= fs.ModeDevice | fs.ModeCharDevice + case sIFDIR: + mode |= fs.ModeDir + case sIFIFO: + mode |= fs.ModeNamedPipe + case sIFLNK: + mode |= fs.ModeSymlink + case sIFREG: + // nothing to do + case sIFSOCK: + mode |= fs.ModeSocket + } + + if m&sISGID != 0 { + mode |= fs.ModeSetgid + } + + if m&sISUID != 0 { + mode |= fs.ModeSetuid + } + + if m&sISVTX != 0 { + mode |= fs.ModeSticky + } + + return mode +} diff --git a/vendor/github.com/bodgit/sevenzip/types.go b/vendor/github.com/bodgit/sevenzip/types.go new file mode 100644 index 0000000000..57583ee3f7 --- /dev/null +++ b/vendor/github.com/bodgit/sevenzip/types.go @@ -0,0 +1,872 @@ +package sevenzip + +import ( + "bufio" + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "math/bits" + "time" + + "github.com/bodgit/sevenzip/internal/util" + "github.com/bodgit/windows" + "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" +) + +const ( + idEnd = iota + idHeader + idArchiveProperties + idAdditionalStreamsInfo + idMainStreamsInfo + idFilesInfo + idPackInfo + idUnpackInfo + idSubStreamsInfo + idSize + idCRC + idFolder + idCodersUnpackSize + idNumUnpackStream + idEmptyStream + idEmptyFile + idAnti //nolint:deadcode,varcheck + idName + idCTime + idATime + idMTime + idWinAttributes + idComment //nolint:deadcode,varcheck + idEncodedHeader + idStartPos + idDummy +) + +var ( + errIncompleteRead = errors.New("sevenzip: incomplete read") + errUnexpectedID = errors.New("sevenzip: unexpected id") + errMissingUnpackInfo = errors.New("sevenzip: missing unpack info") + errWrongNumberOfFilenames = errors.New("sevenzip: wrong number of filenames") +) + +func readUint64(r io.ByteReader) (uint64, error) { + b, err 
:= r.ReadByte() + if err != nil { + return 0, fmt.Errorf("readUint64: ReadByte error: %w", err) + } + + l := bits.LeadingZeros8(^b) + + var v uint64 + if l < 7 { + v |= uint64(b&((1<<(8-l))-1)) << (8 * l) + } + + for i := 0; i < l; i++ { + b, err := r.ReadByte() + if err != nil { + return 0, fmt.Errorf("readUint64: ReadByte error: %w", err) + } + + v |= uint64(b) << (8 * i) + } + + return v, nil +} + +func readBool(r io.ByteReader, count uint64) ([]bool, error) { + defined := make([]bool, count) + + var b, mask byte + for i := range defined { + if mask == 0 { + var err error + + b, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readBool: ReadByte error: %w", err) + } + + mask = 0x80 + } + + defined[i] = (b & mask) != 0 + mask >>= 1 + } + + return defined, nil +} + +func readOptionalBool(r io.ByteReader, count uint64) ([]bool, error) { + all, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readOptionalBool: ReadByte error: %w", err) + } + + if all == 0 { + return readBool(r, count) + } + + defined := make([]bool, count) + for i := range defined { + defined[i] = true + } + + return defined, nil +} + +func readSizes(r io.ByteReader, count uint64) ([]uint64, error) { + sizes := make([]uint64, count) + + for i := uint64(0); i < count; i++ { + size, err := readUint64(r) + if err != nil { + return nil, err + } + + sizes[i] = size + } + + return sizes, nil +} + +func readCRC(r util.Reader, count uint64) ([]uint32, error) { + defined, err := readOptionalBool(r, count) + if err != nil { + return nil, err + } + + crcs := make([]uint32, count) + + for i := range defined { + if defined[i] { + if err := binary.Read(r, binary.LittleEndian, &crcs[i]); err != nil { + return nil, fmt.Errorf("readCRC: Read error: %w", err) + } + } + } + + return crcs, nil +} + +//nolint:cyclop +func readPackInfo(r util.Reader) (*packInfo, error) { + p := new(packInfo) + + var err error + + p.position, err = readUint64(r) + if err != nil { + return nil, err + } + + p.streams, err = readUint64(r) + if err != nil { + return nil, err + } + + id, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readPackInfo: ReadByte error: %w", err) + } + + if id == idSize { + if p.size, err = readSizes(r, p.streams); err != nil { + return nil, err + } + + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readPackInfo: ReadByte error: %w", err) + } + } + + if id == idCRC { + if p.digest, err = readCRC(r, p.streams); err != nil { + return nil, err + } + + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readPackInfo: ReadByte error: %w", err) + } + } + + if id != idEnd { + return nil, errUnexpectedID + } + + return p, nil +} + +//nolint:cyclop +func readCoder(r util.Reader) (*coder, error) { + c := new(coder) + + v, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readCoder: ReadByte error: %w", err) + } + + c.id = make([]byte, v&0xf) + if n, err := r.Read(c.id); err != nil || n != int(v&0xf) { + if err != nil { + return nil, fmt.Errorf("readCoder: Read error: %w", err) + } + + return nil, errIncompleteRead + } + + if v&0x10 != 0 { + c.in, err = readUint64(r) + if err != nil { + return nil, err + } + + c.out, err = readUint64(r) + if err != nil { + return nil, err + } + } else { + c.in, c.out = 1, 1 + } + + if v&0x20 != 0 { + size, err := readUint64(r) + if err != nil { + return nil, err + } + + c.properties = make([]byte, size) + if n, err := r.Read(c.properties); err != nil || uint64(n) != size { //nolint:gosec + if err != nil { + return nil, 
fmt.Errorf("readCoder: Read error: %w", err) + } + + return nil, errIncompleteRead + } + } + + return c, nil +} + +//nolint:cyclop +func readFolder(r util.Reader) (*folder, error) { + f := new(folder) + + coders, err := readUint64(r) + if err != nil { + return nil, err + } + + f.coder = make([]*coder, coders) + + for i := uint64(0); i < coders; i++ { + if f.coder[i], err = readCoder(r); err != nil { + return nil, err + } + + f.in += f.coder[i].in + f.out += f.coder[i].out + } + + bindPairs := f.out - 1 + + f.bindPair = make([]*bindPair, bindPairs) + + for i := uint64(0); i < bindPairs; i++ { + in, err := readUint64(r) + if err != nil { + return nil, err + } + + out, err := readUint64(r) + if err != nil { + return nil, err + } + + f.bindPair[i] = &bindPair{ + in: in, + out: out, + } + } + + f.packedStreams = f.in - bindPairs + + if f.packedStreams == 1 { + f.packed = []uint64{} + for i := uint64(0); i < f.in; i++ { + if f.findInBindPair(i) == nil { + f.packed = append(f.packed, i) + } + } + } else { + f.packed = make([]uint64, f.packedStreams) + for i := uint64(0); i < f.packedStreams; i++ { + if f.packed[i], err = readUint64(r); err != nil { + return nil, err + } + } + } + + return f, nil +} + +//nolint:cyclop,funlen,gocognit +func readUnpackInfo(r util.Reader) (*unpackInfo, error) { + u := new(unpackInfo) + + if id, err := r.ReadByte(); err != nil || id != idFolder { + if err != nil { + return nil, fmt.Errorf("readUnpackInfo: ReadByte error: %w", err) + } + + return nil, errUnexpectedID + } + + folders, err := readUint64(r) + if err != nil { + return nil, err + } + + external, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readUnpackInfo: ReadByte error: %w", err) + } + + if external > 0 { + _, err := readUint64(r) + if err != nil { + return nil, err + } + // TODO Apparently we seek to this read offset and read the + // folder information from there. Not clear if the offset is + // absolute for the whole file, or relative to some known + // position in the file. 
Cowardly waiting for an example + return nil, errors.New("sevenzip: TODO readUnpackInfo external") //nolint:goerr113 + } + + u.folder = make([]*folder, folders) + + for i := uint64(0); i < folders; i++ { + if u.folder[i], err = readFolder(r); err != nil { + return nil, err + } + } + + if id, err := r.ReadByte(); err != nil || id != idCodersUnpackSize { + if err != nil { + return nil, fmt.Errorf("readUnpackInfo: ReadByte error: %w", err) + } + + return nil, errUnexpectedID + } + + for _, f := range u.folder { + total := uint64(0) + for _, c := range f.coder { + total += c.out + } + + f.size = make([]uint64, total) + for i := range f.size { + if f.size[i], err = readUint64(r); err != nil { + return nil, err + } + } + } + + id, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readUnpackInfo: ReadByte error: %w", err) + } + + if id == idCRC { + if u.digest, err = readCRC(r, folders); err != nil { + return nil, err + } + + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readUnpackInfo: ReadByte error: %w", err) + } + } + + if id != idEnd { + return nil, errUnexpectedID + } + + return u, nil +} + +//nolint:cyclop,funlen +func readSubStreamsInfo(r util.Reader, folder []*folder) (*subStreamsInfo, error) { + s := new(subStreamsInfo) + + id, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readSubStreamsInfo: ReadByte error: %w", err) + } + + s.streams = make([]uint64, len(folder)) + if id == idNumUnpackStream { + for i := range s.streams { + if s.streams[i], err = readUint64(r); err != nil { + return nil, err + } + } + + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readSubStreamsInfo: ReadByte error: %w", err) + } + } else { + for i := range s.streams { + s.streams[i] = 1 + } + } + + // Count the files in each stream + files := uint64(0) + for _, v := range s.streams { + files += v + } + + if id == idSize { + s.size = make([]uint64, files) + k := 0 + + for i := range s.streams { + total := uint64(0) + + for j := uint64(1); j < s.streams[i]; j++ { + if s.size[k], err = readUint64(r); err != nil { + return nil, err + } + + total += s.size[k] + k++ + } + + s.size[k] = folder[i].unpackSize() - total + k++ + } + + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readSubStreamsInfo: ReadByte error: %w", err) + } + } + + if id == idCRC { + if s.digest, err = readCRC(r, files); err != nil { + return nil, err + } + + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readSubStreamsInfo: ReadByte error: %w", err) + } + } + + if id != idEnd { + return nil, errUnexpectedID + } + + return s, nil +} + +//nolint:cyclop +func readStreamsInfo(r util.Reader) (*streamsInfo, error) { + s := new(streamsInfo) + + id, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readStreamsInfo: ReadByte error: %w", err) + } + + if id == idPackInfo { + if s.packInfo, err = readPackInfo(r); err != nil { + return nil, err + } + + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readStreamsInfo: ReadByte error: %w", err) + } + } + + if id == idUnpackInfo { + if s.unpackInfo, err = readUnpackInfo(r); err != nil { + return nil, err + } + + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readStreamsInfo: ReadByte error: %w", err) + } + } + + if id == idSubStreamsInfo { + if s.unpackInfo == nil { + return nil, errMissingUnpackInfo + } + + if s.subStreamsInfo, err = readSubStreamsInfo(r, s.unpackInfo.folder); err != nil { + return nil, err + } + + id, err = r.ReadByte() + if err != nil { 
+ return nil, fmt.Errorf("readStreamsInfo: ReadByte error: %w", err) + } + } + + if id != idEnd { + return nil, errUnexpectedID + } + + return s, nil +} + +func readTimes(r util.Reader, count uint64) ([]time.Time, error) { + defined, err := readOptionalBool(r, count) + if err != nil { + return nil, err + } + + external, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readTimes: ReadByte error: %w", err) + } + + if external > 0 { + _, err := readUint64(r) + if err != nil { + return nil, err + } + // TODO Apparently we seek to this read offset and read the + // folder information from there. Not clear if the offset is + // absolute for the whole file, or relative to some known + // position in the file. Cowardly waiting for an example + return nil, errors.New("sevenzip: TODO readTimes external") //nolint:goerr113 + } + + times := make([]time.Time, count) + + for i := range defined { + if defined[i] { + var ft windows.Filetime + if err := binary.Read(r, binary.LittleEndian, &ft); err != nil { + return nil, fmt.Errorf("readTimes: Read error: %w", err) + } + + times[i] = time.Unix(0, ft.Nanoseconds()).UTC() + } + } + + return times, nil +} + +func splitNull(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + if i := bytes.IndexRune(data, rune(0)); i >= 0 { + return i + 1, data[0:i], nil + } + + if atEOF { + return len(data), data, nil + } + + return +} + +func readNames(r util.Reader, count, length uint64) ([]string, error) { + external, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readNames: ReadByte error: %w", err) + } + + if external > 0 { + _, err := readUint64(r) + if err != nil { + return nil, err + } + // TODO Apparently we seek to this read offset and read the + // folder information from there. Not clear if the offset is + // absolute for the whole file, or relative to some known + // position in the file. Cowardly waiting for an example + return nil, errors.New("sevenzip: TODO readNames external") //nolint:goerr113 + } + + utf16le := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM) + scanner := bufio.NewScanner(transform.NewReader(io.LimitReader(r, int64(length-1)), utf16le.NewDecoder())) //nolint:gosec,lll + scanner.Split(splitNull) + + names, i := make([]string, 0, count), uint64(0) + for scanner.Scan() { + names = append(names, scanner.Text()) + i++ + } + + if err = scanner.Err(); err != nil { + return nil, fmt.Errorf("readNames: Scan error: %w", err) + } + + if i != count { + return nil, errWrongNumberOfFilenames + } + + return names, nil +} + +func readAttributes(r util.Reader, count uint64) ([]uint32, error) { + defined, err := readOptionalBool(r, count) + if err != nil { + return nil, err + } + + external, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readAttributes: ReadByte error: %w", err) + } + + if external > 0 { + _, err := readUint64(r) + if err != nil { + return nil, err + } + // TODO Apparently we seek to this read offset and read the + // folder information from there. Not clear if the offset is + // absolute for the whole file, or relative to some known + // position in the file. 
Cowardly waiting for an example + return nil, errors.New("sevenzip: TODO readAttributes external") //nolint:goerr113 + } + + attributes := make([]uint32, count) + + for i := range defined { + if defined[i] { + if err := binary.Read(r, binary.LittleEndian, &attributes[i]); err != nil { + return nil, fmt.Errorf("readAttributes: Read error: %w", err) + } + } + } + + return attributes, nil +} + +//nolint:cyclop,funlen,gocognit,gocyclo +func readFilesInfo(r util.Reader) (*filesInfo, error) { + f := new(filesInfo) + + files, err := readUint64(r) + if err != nil { + return nil, err + } + + f.file = make([]FileHeader, files) + + var emptyStreams uint64 + + for { + property, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readFilesInfo: ReadByte error: %w", err) + } + + if property == idEnd { + break + } + + length, err := readUint64(r) + if err != nil { + return nil, err + } + + switch property { + case idEmptyStream: + empty, err := readBool(r, files) + if err != nil { + return nil, err + } + + for i := range f.file { + f.file[i].isEmptyStream = empty[i] + + if empty[i] { + emptyStreams++ + } + } + case idEmptyFile: + empty, err := readBool(r, emptyStreams) + if err != nil { + return nil, err + } + + j := 0 + + for i := range f.file { + if f.file[i].isEmptyStream { + f.file[i].isEmptyFile = empty[j] + j++ + } + } + case idCTime: + times, err := readTimes(r, files) + if err != nil { + return nil, err + } + + for i, t := range times { + f.file[i].Created = t + } + case idATime: + times, err := readTimes(r, files) + if err != nil { + return nil, err + } + + for i, t := range times { + f.file[i].Accessed = t + } + case idMTime: + times, err := readTimes(r, files) + if err != nil { + return nil, err + } + + for i, t := range times { + f.file[i].Modified = t + } + case idName: + names, err := readNames(r, files, length) + if err != nil { + return nil, err + } + + for i, n := range names { + f.file[i].Name = n + } + case idWinAttributes: + attributes, err := readAttributes(r, files) + if err != nil { + return nil, err + } + + for i, a := range attributes { + f.file[i].Attributes = a + } + case idStartPos: + return nil, errors.New("sevenzip: TODO idStartPos") //nolint:goerr113 + case idDummy: + if _, err := io.CopyN(io.Discard, r, int64(length)); err != nil { //nolint:gosec + return nil, fmt.Errorf("readFilesInfo: CopyN error: %w", err) + } + default: + return nil, errUnexpectedID + } + } + + return f, nil +} + +//nolint:cyclop,funlen +func readHeader(r util.Reader) (*header, error) { + h := new(header) + + id, err := r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readHeader: ReadByte error: %w", err) + } + + if id == idArchiveProperties { + return nil, errors.New("sevenzip: TODO idArchiveProperties") //nolint:goerr113,revive + + //nolint:govet + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readHeader: ReadByte error: %w", err) + } + } + + if id == idAdditionalStreamsInfo { + return nil, errors.New("sevenzip: TODO idAdditionalStreamsInfo") //nolint:goerr113,revive + + //nolint:govet + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readHeader: ReadByte error: %w", err) + } + } + + if id == idMainStreamsInfo { + if h.streamsInfo, err = readStreamsInfo(r); err != nil { + return nil, err + } + + id, err = r.ReadByte() + if err != nil { + return nil, fmt.Errorf("readHeader: ReadByte error: %w", err) + } + } + + if id == idFilesInfo { + if h.filesInfo, err = readFilesInfo(r); err != nil { + return nil, err + } + + id, err = r.ReadByte() + if 
err != nil { + return nil, fmt.Errorf("readHeader: ReadByte error: %w", err) + } + } + + if id != idEnd { + return nil, errUnexpectedID + } + + if h.streamsInfo == nil || h.filesInfo == nil { + return h, nil + } + + j := 0 + + for i := range h.filesInfo.file { + if h.filesInfo.file[i].isEmptyStream { + continue + } + + if h.streamsInfo.subStreamsInfo != nil { + h.filesInfo.file[i].CRC32 = h.streamsInfo.subStreamsInfo.digest[j] + } + + _, h.filesInfo.file[i].UncompressedSize = h.streamsInfo.FileFolderAndSize(j) + j++ + } + + return h, nil +} + +func readEncodedHeader(r util.Reader) (*header, error) { + if id, err := r.ReadByte(); err != nil || id != idHeader { + if err != nil { + return nil, fmt.Errorf("readEncodedHeader: ReadByte error: %w", err) + } + + return nil, errUnexpectedID + } + + header, err := readHeader(r) + if err != nil { + return nil, err + } + + return header, nil +} diff --git a/vendor/github.com/bodgit/windows/.golangci.yaml b/vendor/github.com/bodgit/windows/.golangci.yaml new file mode 100644 index 0000000000..f74dd15446 --- /dev/null +++ b/vendor/github.com/bodgit/windows/.golangci.yaml @@ -0,0 +1,13 @@ +--- +linters: + enable-all: true + disable: + - exhaustivestruct + - exhaustruct + - godox + - goerr113 + - gomnd + - ireturn + - nonamedreturns + - varnamelen + - wrapcheck diff --git a/vendor/github.com/bodgit/windows/.goreleaser.yml b/vendor/github.com/bodgit/windows/.goreleaser.yml new file mode 100644 index 0000000000..75e2a1f7e0 --- /dev/null +++ b/vendor/github.com/bodgit/windows/.goreleaser.yml @@ -0,0 +1,7 @@ +--- +builds: + - skip: true +release: + prerelease: auto +changelog: + use: github-native diff --git a/vendor/github.com/bodgit/windows/LICENSE b/vendor/github.com/bodgit/windows/LICENSE new file mode 100644 index 0000000000..08172a91ab --- /dev/null +++ b/vendor/github.com/bodgit/windows/LICENSE @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2020, Matt Dainty +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
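The new sevenzip types above (FileHeader, headerFileInfo, and the folder/stream readers) add up to an archive/zip-style API. A minimal consumption sketch for orientation follows — sevenzip.OpenReader, the ReadCloser.File slice, and File.Open are assumed from the package's exported API (they are not part of the hunks shown here), and archive.7z is a placeholder path:

```go
package main

import (
	"fmt"
	"io"
	"log"

	"github.com/bodgit/sevenzip"
)

func main() {
	// OpenReader (assumed exported API) parses the signature/start headers
	// and the possibly-encoded header handled by readHeader/readEncodedHeader.
	r, err := sevenzip.OpenReader("archive.7z") // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()

	for _, f := range r.File {
		// FileInfo is backed by headerFileInfo from struct.go; Mode()
		// decodes either the POSIX or the MS-DOS attribute bits.
		fi := f.FileInfo()
		fmt.Printf("%v %10d %s\n", fi.Mode(), fi.Size(), f.Name)

		rc, err := f.Open()
		if err != nil {
			log.Fatal(err)
		}
		// Draining the stream lets the folder reader accumulate the
		// CRC32 that corresponds to FileHeader.CRC32.
		if _, err := io.Copy(io.Discard, rc); err != nil {
			log.Fatal(err)
		}
		rc.Close()
	}
}
```

Per the FileHeader.Stream documentation above, files sharing a Stream value live in the same compressed stream, so iterating in Stream order avoids decompressing the same folder repeatedly.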
+ diff --git a/vendor/github.com/bodgit/windows/README.md b/vendor/github.com/bodgit/windows/README.md new file mode 100644 index 0000000000..1c3fb31a33 --- /dev/null +++ b/vendor/github.com/bodgit/windows/README.md @@ -0,0 +1,14 @@ +[![GitHub release](https://img.shields.io/github/v/release/bodgit/windows)](https://github.com/bodgit/windows/releases) +[![Build Status](https://img.shields.io/github/workflow/status/bodgit/windows/build)](https://github.com/bodgit/windows/actions?query=workflow%3Abuild) +[![Coverage Status](https://coveralls.io/repos/github/bodgit/windows/badge.svg?branch=main)](https://coveralls.io/github/bodgit/windows?branch=main) +[![Go Report Card](https://goreportcard.com/badge/github.com/bodgit/windows)](https://goreportcard.com/report/github.com/bodgit/windows) +[![GoDoc](https://godoc.org/github.com/bodgit/windows?status.svg)](https://godoc.org/github.com/bodgit/windows) +![Go version](https://img.shields.io/badge/Go-1.18-brightgreen.svg) +![Go version](https://img.shields.io/badge/Go-1.17-brightgreen.svg) + +windows +======= + +A collection of types that are native to Windows but useful on non-Windows platforms. + +The `FILETIME`-equivalent type is the sole export; it is a 1:1 copy of the type found in the `golang.org/x/sys/windows` package. That package only builds on `GOOS=windows`, and this particular type gets used in other protocols and file types such as NTLMv2 and 7-zip. diff --git a/vendor/github.com/bodgit/windows/filetime.go b/vendor/github.com/bodgit/windows/filetime.go new file mode 100644 index 0000000000..96e678d60c --- /dev/null +++ b/vendor/github.com/bodgit/windows/filetime.go @@ -0,0 +1,44 @@ +// Package windows is a collection of types that are native to Windows +// platforms but useful on non-Windows platforms. +package windows + +// Taken from golang.org/x/sys/windows + +const offset int64 = 116444736000000000 + +// Filetime mirrors the Windows FILETIME structure which represents time +// as the number of 100-nanosecond intervals that have elapsed since +// 00:00:00 UTC, January 1, 1601. This code is taken from the +// golang.org/x/sys/windows package, where it's not available for non-Windows +// platforms; however, various file formats and protocols pass this structure +// about, so it's useful to have it available for interoperability purposes. +type Filetime struct { + LowDateTime uint32 + HighDateTime uint32 +} + +// Nanoseconds returns Filetime ft in nanoseconds +// since Epoch (00:00:00 UTC, January 1, 1970). +func (ft *Filetime) Nanoseconds() int64 { + // 100-nanosecond intervals since January 1, 1601 + nsec := int64(ft.HighDateTime)<<32 + int64(ft.LowDateTime) + // change starting time to the Epoch (00:00:00 UTC, January 1, 1970) + nsec -= offset + // convert into nanoseconds + nsec *= 100 + + return nsec +} + +// NsecToFiletime converts nanoseconds to the equivalent Filetime type.
+func NsecToFiletime(nsec int64) (ft Filetime) { + // convert into 100-nanosecond + nsec /= 100 + // change starting time to January 1, 1601 + nsec += offset + // split into high / low + ft.LowDateTime = uint32(nsec & 0xffffffff) + ft.HighDateTime = uint32(nsec >> 32 & 0xffffffff) + + return ft +} diff --git a/vendor/github.com/dsnet/compress/bzip2/bwt.go b/vendor/github.com/dsnet/compress/bzip2/bwt.go index 44a2541fe4..4ed2d74489 100644 --- a/vendor/github.com/dsnet/compress/bzip2/bwt.go +++ b/vendor/github.com/dsnet/compress/bzip2/bwt.go @@ -15,6 +15,7 @@ import "github.com/dsnet/compress/bzip2/internal/sais" // Transform, such that a SA can be converted to a BWT in O(n) time. // // References: +// // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf // https://github.com/cscott/compressjs/blob/master/lib/BWT.js // https://www.quora.com/How-can-I-optimize-burrows-wheeler-transform-and-inverse-transform-to-work-in-O-n-time-O-n-space diff --git a/vendor/github.com/dsnet/compress/bzip2/common.go b/vendor/github.com/dsnet/compress/bzip2/common.go index c633981526..ae4c966e4e 100644 --- a/vendor/github.com/dsnet/compress/bzip2/common.go +++ b/vendor/github.com/dsnet/compress/bzip2/common.go @@ -5,9 +5,11 @@ // Package bzip2 implements the BZip2 compressed data format. // // Canonical C implementation: +// // http://bzip.org // // Unofficial format specification: +// // https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf package bzip2 diff --git a/vendor/github.com/dsnet/compress/bzip2/fuzz_off.go b/vendor/github.com/dsnet/compress/bzip2/fuzz_off.go index ddd32f5065..ec894e2eb4 100644 --- a/vendor/github.com/dsnet/compress/bzip2/fuzz_off.go +++ b/vendor/github.com/dsnet/compress/bzip2/fuzz_off.go @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE.md file. +//go:build !gofuzz // +build !gofuzz // This file exists to suppress fuzzing details from release builds. diff --git a/vendor/github.com/dsnet/compress/bzip2/fuzz_on.go b/vendor/github.com/dsnet/compress/bzip2/fuzz_on.go index 54122351c5..0bae7718f6 100644 --- a/vendor/github.com/dsnet/compress/bzip2/fuzz_on.go +++ b/vendor/github.com/dsnet/compress/bzip2/fuzz_on.go @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE.md file. +//go:build gofuzz // +build gofuzz // This file exists to export internal implementation details for fuzz testing. diff --git a/vendor/github.com/dsnet/compress/bzip2/mtf_rle2.go b/vendor/github.com/dsnet/compress/bzip2/mtf_rle2.go index 5c71b3431d..8f5c1ac967 100644 --- a/vendor/github.com/dsnet/compress/bzip2/mtf_rle2.go +++ b/vendor/github.com/dsnet/compress/bzip2/mtf_rle2.go @@ -14,6 +14,7 @@ import "github.com/dsnet/compress/internal/errors" // normal two's complement arithmetic. The methodology for doing so is below. // // Assuming the following: +// // num: The value being encoded by RLE encoding. // run: A sequence of RUNA and RUNB symbols represented as a binary integer, // where RUNA is the 0 bit, RUNB is the 1 bit, and least-significant RUN @@ -21,6 +22,7 @@ import "github.com/dsnet/compress/internal/errors" // cnt: The number of RUNA and RUNB symbols. 
// // Then the RLE encoding used by bzip2 has this mathematical property: +// // num+1 == (1<<cnt) | run [...] diff --git a/vendor/github.com/dsnet/compress/bzip2/prefix.go b/vendor/github.com/dsnet/compress/bzip2/prefix.go [...] // 11110 <=> 4 // 111110 <=> 5 // 111111 <=> 6 Invalid tree index, so should fail -// var encSel, decSel = func() (e prefix.Encoder, d prefix.Decoder) { var selCodes [maxNumTrees + 1]prefix.PrefixCode for i := range selCodes { @@ -150,6 +149,7 @@ func (pw *prefixWriter) WritePrefixCodes(codes []prefix.PrefixCodes, trees []pre // handleDegenerateCodes converts a degenerate tree into a canonical tree. // // For example, when the input is an under-subscribed tree: +// // input: []PrefixCode{ // {Sym: 0, Len: 3}, // {Sym: 1, Len: 4}, @@ -165,6 +165,7 @@ func (pw *prefixWriter) WritePrefixCodes(codes []prefix.PrefixCodes, trees []pre // } // // For example, when the input is an over-subscribed tree: +// // input: []PrefixCode{ // {Sym: 0, Len: 1}, // {Sym: 1, Len: 3}, diff --git a/vendor/github.com/dsnet/compress/bzip2/rle1.go b/vendor/github.com/dsnet/compress/bzip2/rle1.go index 1d789f65f2..b96f0cfc9f 100644 --- a/vendor/github.com/dsnet/compress/bzip2/rle1.go +++ b/vendor/github.com/dsnet/compress/bzip2/rle1.go @@ -17,9 +17,11 @@ var rleDone = errorf(errors.Unknown, "RLE1 stage is completed") // run lengths of 256..259. The decoder can handle the latter case. // // For example, if the input was: +// // input: "AAAAAAABBBBCCCD" // // Then the output will be: +// // output: "AAAA\x03BBBB\x00CCCD" type runLengthEncoding struct { buf []byte diff --git a/vendor/github.com/dsnet/compress/internal/debug.go b/vendor/github.com/dsnet/compress/internal/debug.go index 01df1f8953..92435377c3 100644 --- a/vendor/github.com/dsnet/compress/internal/debug.go +++ b/vendor/github.com/dsnet/compress/internal/debug.go @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE.md file. +//go:build debug && !gofuzz // +build debug,!gofuzz package internal diff --git a/vendor/github.com/dsnet/compress/internal/errors/errors.go b/vendor/github.com/dsnet/compress/internal/errors/errors.go index c631afbd62..daf3fe93cf 100644 --- a/vendor/github.com/dsnet/compress/internal/errors/errors.go +++ b/vendor/github.com/dsnet/compress/internal/errors/errors.go @@ -17,6 +17,7 @@ // recover from errors only generated from within this repository. // // Example usage: +// // func Foo() (err error) { // defer errors.Recover(&err) // @@ -28,7 +29,6 @@ // errors.Panic(errors.New("whoopsie")) // } // } -// package errors import "strings" diff --git a/vendor/github.com/dsnet/compress/internal/gofuzz.go b/vendor/github.com/dsnet/compress/internal/gofuzz.go index 5035c9d63f..38f44d0e19 100644 --- a/vendor/github.com/dsnet/compress/internal/gofuzz.go +++ b/vendor/github.com/dsnet/compress/internal/gofuzz.go @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE.md file. +//go:build gofuzz // +build gofuzz package internal diff --git a/vendor/github.com/dsnet/compress/internal/prefix/debug.go b/vendor/github.com/dsnet/compress/internal/prefix/debug.go index 04fce70bbb..2a1cb25a8c 100644 --- a/vendor/github.com/dsnet/compress/internal/prefix/debug.go +++ b/vendor/github.com/dsnet/compress/internal/prefix/debug.go @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE.md file.
+//go:build debug // +build debug package prefix diff --git a/vendor/github.com/dsnet/compress/internal/prefix/prefix.go b/vendor/github.com/dsnet/compress/internal/prefix/prefix.go index c73e748e88..0c333f92de 100644 --- a/vendor/github.com/dsnet/compress/internal/prefix/prefix.go +++ b/vendor/github.com/dsnet/compress/internal/prefix/prefix.go @@ -91,8 +91,8 @@ func (pc PrefixCodes) checkPrefixes() bool { // checkCanonical reports whether all codes are canonical. // That is, they have the following properties: // -// 1. All codes of a given bit-length are consecutive values. -// 2. Shorter codes lexicographically precede longer codes. +// 1. All codes of a given bit-length are consecutive values. +// 2. Shorter codes lexicographically precede longer codes. // // The codes must have unique symbols and be sorted by the symbol // The Len and Val fields in each code must be populated. diff --git a/vendor/github.com/dsnet/compress/internal/prefix/range.go b/vendor/github.com/dsnet/compress/internal/prefix/range.go index b7eddad537..15ec9343f1 100644 --- a/vendor/github.com/dsnet/compress/internal/prefix/range.go +++ b/vendor/github.com/dsnet/compress/internal/prefix/range.go @@ -37,6 +37,7 @@ func (rcs RangeCodes) End() uint32 { return rcs[len(rcs)-1].End() } // checkValid reports whether the RangeCodes is valid. In order to be valid, // the following must hold true: +// // rcs[i-1].Base <= rcs[i].Base // rcs[i-1].End <= rcs[i].End // rcs[i-1].End >= rcs[i].Base diff --git a/vendor/github.com/dsnet/compress/internal/release.go b/vendor/github.com/dsnet/compress/internal/release.go index 0990be1c5a..2d25f2fa42 100644 --- a/vendor/github.com/dsnet/compress/internal/release.go +++ b/vendor/github.com/dsnet/compress/internal/release.go @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE.md file. +//go:build !debug && !gofuzz // +build !debug,!gofuzz package internal diff --git a/vendor/github.com/golang/snappy/AUTHORS b/vendor/github.com/golang/snappy/AUTHORS deleted file mode 100644 index 52ccb5a934..0000000000 --- a/vendor/github.com/golang/snappy/AUTHORS +++ /dev/null @@ -1,18 +0,0 @@ -# This is the official list of Snappy-Go authors for copyright purposes. -# This file is distinct from the CONTRIBUTORS files. -# See the latter for an explanation. - -# Names should be added to this file as -# Name or Organization -# The email address is not required for organizations. - -# Please keep the list sorted. - -Amazon.com, Inc -Damian Gryski -Eric Buth -Google Inc. -Jan Mercl <0xjnml@gmail.com> -Klaus Post -Rodolfo Carvalho -Sebastien Binet diff --git a/vendor/github.com/golang/snappy/CONTRIBUTORS b/vendor/github.com/golang/snappy/CONTRIBUTORS deleted file mode 100644 index ea6524ddd0..0000000000 --- a/vendor/github.com/golang/snappy/CONTRIBUTORS +++ /dev/null @@ -1,41 +0,0 @@ -# This is the official list of people who can contribute -# (and typically have contributed) code to the Snappy-Go repository. -# The AUTHORS file lists the copyright holders; this file -# lists people. For example, Google employees are listed here -# but not in AUTHORS, because Google holds the copyright. -# -# The submission process automatically checks to make sure -# that people submitting code are listed in this file (by email address). 
-# -# Names should be added to this file only after verifying that -# the individual or the individual's organization has agreed to -# the appropriate Contributor License Agreement, found here: -# -# http://code.google.com/legal/individual-cla-v1.0.html -# http://code.google.com/legal/corporate-cla-v1.0.html -# -# The agreement for individuals can be filled out on the web. -# -# When adding J Random Contributor's name to this file, -# either J's name or J's organization's name should be -# added to the AUTHORS file, depending on whether the -# individual or corporate CLA was used. - -# Names should be added to this file like so: -# Name - -# Please keep the list sorted. - -Alex Legg -Damian Gryski -Eric Buth -Jan Mercl <0xjnml@gmail.com> -Jonathan Swinney -Kai Backman -Klaus Post -Marc-Antoine Ruel -Nigel Tao -Rob Pike -Rodolfo Carvalho -Russ Cox -Sebastien Binet diff --git a/vendor/github.com/golang/snappy/README b/vendor/github.com/golang/snappy/README deleted file mode 100644 index cea12879a0..0000000000 --- a/vendor/github.com/golang/snappy/README +++ /dev/null @@ -1,107 +0,0 @@ -The Snappy compression format in the Go programming language. - -To download and install from source: -$ go get github.com/golang/snappy - -Unless otherwise noted, the Snappy-Go source files are distributed -under the BSD-style license found in the LICENSE file. - - - -Benchmarks. - -The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten -or so files, the same set used by the C++ Snappy code (github.com/google/snappy -and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @ -3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29: - -"go test -test.bench=." - -_UFlat0-8 2.19GB/s ± 0% html -_UFlat1-8 1.41GB/s ± 0% urls -_UFlat2-8 23.5GB/s ± 2% jpg -_UFlat3-8 1.91GB/s ± 0% jpg_200 -_UFlat4-8 14.0GB/s ± 1% pdf -_UFlat5-8 1.97GB/s ± 0% html4 -_UFlat6-8 814MB/s ± 0% txt1 -_UFlat7-8 785MB/s ± 0% txt2 -_UFlat8-8 857MB/s ± 0% txt3 -_UFlat9-8 719MB/s ± 1% txt4 -_UFlat10-8 2.84GB/s ± 0% pb -_UFlat11-8 1.05GB/s ± 0% gaviota - -_ZFlat0-8 1.04GB/s ± 0% html -_ZFlat1-8 534MB/s ± 0% urls -_ZFlat2-8 15.7GB/s ± 1% jpg -_ZFlat3-8 740MB/s ± 3% jpg_200 -_ZFlat4-8 9.20GB/s ± 1% pdf -_ZFlat5-8 991MB/s ± 0% html4 -_ZFlat6-8 379MB/s ± 0% txt1 -_ZFlat7-8 352MB/s ± 0% txt2 -_ZFlat8-8 396MB/s ± 1% txt3 -_ZFlat9-8 327MB/s ± 1% txt4 -_ZFlat10-8 1.33GB/s ± 1% pb -_ZFlat11-8 605MB/s ± 1% gaviota - - - -"go test -test.bench=. 
-tags=noasm" - -_UFlat0-8 621MB/s ± 2% html -_UFlat1-8 494MB/s ± 1% urls -_UFlat2-8 23.2GB/s ± 1% jpg -_UFlat3-8 1.12GB/s ± 1% jpg_200 -_UFlat4-8 4.35GB/s ± 1% pdf -_UFlat5-8 609MB/s ± 0% html4 -_UFlat6-8 296MB/s ± 0% txt1 -_UFlat7-8 288MB/s ± 0% txt2 -_UFlat8-8 309MB/s ± 1% txt3 -_UFlat9-8 280MB/s ± 1% txt4 -_UFlat10-8 753MB/s ± 0% pb -_UFlat11-8 400MB/s ± 0% gaviota - -_ZFlat0-8 409MB/s ± 1% html -_ZFlat1-8 250MB/s ± 1% urls -_ZFlat2-8 12.3GB/s ± 1% jpg -_ZFlat3-8 132MB/s ± 0% jpg_200 -_ZFlat4-8 2.92GB/s ± 0% pdf -_ZFlat5-8 405MB/s ± 1% html4 -_ZFlat6-8 179MB/s ± 1% txt1 -_ZFlat7-8 170MB/s ± 1% txt2 -_ZFlat8-8 189MB/s ± 1% txt3 -_ZFlat9-8 164MB/s ± 1% txt4 -_ZFlat10-8 479MB/s ± 1% pb -_ZFlat11-8 270MB/s ± 1% gaviota - - - -For comparison (Go's encoded output is byte-for-byte identical to C++'s), here -are the numbers from C++ Snappy's - -make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log - -BM_UFlat/0 2.4GB/s html -BM_UFlat/1 1.4GB/s urls -BM_UFlat/2 21.8GB/s jpg -BM_UFlat/3 1.5GB/s jpg_200 -BM_UFlat/4 13.3GB/s pdf -BM_UFlat/5 2.1GB/s html4 -BM_UFlat/6 1.0GB/s txt1 -BM_UFlat/7 959.4MB/s txt2 -BM_UFlat/8 1.0GB/s txt3 -BM_UFlat/9 864.5MB/s txt4 -BM_UFlat/10 2.9GB/s pb -BM_UFlat/11 1.2GB/s gaviota - -BM_ZFlat/0 944.3MB/s html (22.31 %) -BM_ZFlat/1 501.6MB/s urls (47.78 %) -BM_ZFlat/2 14.3GB/s jpg (99.95 %) -BM_ZFlat/3 538.3MB/s jpg_200 (73.00 %) -BM_ZFlat/4 8.3GB/s pdf (83.30 %) -BM_ZFlat/5 903.5MB/s html4 (22.52 %) -BM_ZFlat/6 336.0MB/s txt1 (57.88 %) -BM_ZFlat/7 312.3MB/s txt2 (61.91 %) -BM_ZFlat/8 353.1MB/s txt3 (54.99 %) -BM_ZFlat/9 289.9MB/s txt4 (66.26 %) -BM_ZFlat/10 1.2GB/s pb (19.68 %) -BM_ZFlat/11 527.4MB/s gaviota (37.72 %) diff --git a/vendor/github.com/golang/snappy/encode_amd64.s b/vendor/github.com/golang/snappy/encode_amd64.s deleted file mode 100644 index adfd979fe2..0000000000 --- a/vendor/github.com/golang/snappy/encode_amd64.s +++ /dev/null @@ -1,730 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" - -// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a -// Go toolchain regression. See https://github.com/golang/go/issues/15426 and -// https://github.com/golang/snappy/issues/29 -// -// As a workaround, the package was built with a known good assembler, and -// those instructions were disassembled by "objdump -d" to yield the -// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 -// style comments, in AT&T asm syntax. Note that rsp here is a physical -// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm). -// The instructions were then encoded as "BYTE $0x.." sequences, which assemble -// fine on Go 1.6. - -// The asm code generally follows the pure Go code in encode_other.go, except -// where marked with a "!!!". - -// ---------------------------------------------------------------------------- - -// func emitLiteral(dst, lit []byte) int -// -// All local variables fit into registers. The register allocation: -// - AX len(lit) -// - BX n -// - DX return value -// - DI &dst[i] -// - R10 &lit[0] -// -// The 24 bytes of stack space is to call runtime·memmove. 
-// -// The unusual register allocation of local variables, such as R10 for the -// source pointer, matches the allocation used at the call site in encodeBlock, -// which makes it easier to manually inline this function. -TEXT ·emitLiteral(SB), NOSPLIT, $24-56 - MOVQ dst_base+0(FP), DI - MOVQ lit_base+24(FP), R10 - MOVQ lit_len+32(FP), AX - MOVQ AX, DX - MOVL AX, BX - SUBL $1, BX - - CMPL BX, $60 - JLT oneByte - CMPL BX, $256 - JLT twoBytes - -threeBytes: - MOVB $0xf4, 0(DI) - MOVW BX, 1(DI) - ADDQ $3, DI - ADDQ $3, DX - JMP memmove - -twoBytes: - MOVB $0xf0, 0(DI) - MOVB BX, 1(DI) - ADDQ $2, DI - ADDQ $2, DX - JMP memmove - -oneByte: - SHLB $2, BX - MOVB BX, 0(DI) - ADDQ $1, DI - ADDQ $1, DX - -memmove: - MOVQ DX, ret+48(FP) - - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // DI, R10 and AX as arguments. - MOVQ DI, 0(SP) - MOVQ R10, 8(SP) - MOVQ AX, 16(SP) - CALL runtime·memmove(SB) - RET - -// ---------------------------------------------------------------------------- - -// func emitCopy(dst []byte, offset, length int) int -// -// All local variables fit into registers. The register allocation: -// - AX length -// - SI &dst[0] -// - DI &dst[i] -// - R11 offset -// -// The unusual register allocation of local variables, such as R11 for the -// offset, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. -TEXT ·emitCopy(SB), NOSPLIT, $0-48 - MOVQ dst_base+0(FP), DI - MOVQ DI, SI - MOVQ offset+24(FP), R11 - MOVQ length+32(FP), AX - -loop0: - // for length >= 68 { etc } - CMPL AX, $68 - JLT step1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVB $0xfe, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $64, AX - JMP loop0 - -step1: - // if length > 64 { etc } - CMPL AX, $64 - JLE step2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVB $0xee, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $60, AX - -step2: - // if length >= 12 || offset >= 2048 { goto step3 } - CMPL AX, $12 - JGE step3 - CMPL R11, $2048 - JGE step3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(DI) - SHRL $8, R11 - SHLB $5, R11 - SUBB $4, AX - SHLB $2, AX - ORB AX, R11 - ORB $1, R11 - MOVB R11, 0(DI) - ADDQ $2, DI - - // Return the number of bytes written. - SUBQ SI, DI - MOVQ DI, ret+40(FP) - RET - -step3: - // Emit the remaining copy, encoded as 3 bytes. - SUBL $1, AX - SHLB $2, AX - ORB $2, AX - MOVB AX, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - - // Return the number of bytes written. - SUBQ SI, DI - MOVQ DI, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func extendMatch(src []byte, i, j int) int -// -// All local variables fit into registers. The register allocation: -// - DX &src[0] -// - SI &src[j] -// - R13 &src[len(src) - 8] -// - R14 &src[len(src)] -// - R15 &src[i] -// -// The unusual register allocation of local variables, such as R15 for a source -// pointer, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. -TEXT ·extendMatch(SB), NOSPLIT, $0-48 - MOVQ src_base+0(FP), DX - MOVQ src_len+8(FP), R14 - MOVQ i+24(FP), R15 - MOVQ j+32(FP), SI - ADDQ DX, R14 - ADDQ DX, R15 - ADDQ DX, SI - MOVQ R14, R13 - SUBQ $8, R13 - -cmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. 
- CMPQ SI, R13 - JA cmp1 - MOVQ (R15), AX - MOVQ (SI), BX - CMPQ AX, BX - JNE bsf - ADDQ $8, R15 - ADDQ $8, SI - JMP cmp8 - -bsf: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. The BSF instruction finds the - // least significant 1 bit, the amd64 architecture is little-endian, and - // the shift by 3 converts a bit index to a byte index. - XORQ AX, BX - BSFQ BX, BX - SHRQ $3, BX - ADDQ BX, SI - - // Convert from &src[ret] to ret. - SUBQ DX, SI - MOVQ SI, ret+40(FP) - RET - -cmp1: - // In src's tail, compare 1 byte at a time. - CMPQ SI, R14 - JAE extendMatchEnd - MOVB (R15), AX - MOVB (SI), BX - CMPB AX, BX - JNE extendMatchEnd - ADDQ $1, R15 - ADDQ $1, SI - JMP cmp1 - -extendMatchEnd: - // Convert from &src[ret] to ret. - SUBQ DX, SI - MOVQ SI, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func encodeBlock(dst, src []byte) (d int) -// -// All local variables fit into registers, other than "var table". The register -// allocation: -// - AX . . -// - BX . . -// - CX 56 shift (note that amd64 shifts by non-immediates must use CX). -// - DX 64 &src[0], tableSize -// - SI 72 &src[s] -// - DI 80 &dst[d] -// - R9 88 sLimit -// - R10 . &src[nextEmit] -// - R11 96 prevHash, currHash, nextHash, offset -// - R12 104 &src[base], skip -// - R13 . &src[nextS], &src[len(src) - 8] -// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x -// - R15 112 candidate -// -// The second column (56, 64, etc) is the stack offset to spill the registers -// when calling other functions. We could pack this slightly tighter, but it's -// simpler to have a dedicated spill map independent of the function called. -// -// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An -// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill -// local variables (registers) during calls gives 32768 + 56 + 64 = 32888. -TEXT ·encodeBlock(SB), 0, $32888-56 - MOVQ dst_base+0(FP), DI - MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), R14 - - // shift, tableSize := uint32(32-8), 1<<8 - MOVQ $24, CX - MOVQ $256, DX - -calcShift: - // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { - // shift-- - // } - CMPQ DX, $16384 - JGE varTable - CMPQ DX, R14 - JGE varTable - SUBQ $1, CX - SHLQ $1, DX - JMP calcShift - -varTable: - // var table [maxTableSize]uint16 - // - // In the asm code, unlike the Go code, we can zero-initialize only the - // first tableSize elements. Each uint16 element is 2 bytes and each MOVOU - // writes 16 bytes, so we can do only tableSize/8 writes instead of the - // 2048 writes that would zero-initialize all of table's 32768 bytes. - SHRQ $3, DX - LEAQ table-32768(SP), BX - PXOR X0, X0 - -memclr: - MOVOU X0, 0(BX) - ADDQ $16, BX - SUBQ $1, DX - JNZ memclr - - // !!! DX = &src[0] - MOVQ SI, DX - - // sLimit := len(src) - inputMargin - MOVQ R14, R9 - SUBQ $15, R9 - - // !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't - // change for the rest of the function. 
- MOVQ CX, 56(SP) - MOVQ DX, 64(SP) - MOVQ R9, 88(SP) - - // nextEmit := 0 - MOVQ DX, R10 - - // s := 1 - ADDQ $1, SI - - // nextHash := hash(load32(src, s), shift) - MOVL 0(SI), R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - -outer: - // for { etc } - - // skip := 32 - MOVQ $32, R12 - - // nextS := s - MOVQ SI, R13 - - // candidate := 0 - MOVQ $0, R15 - -inner0: - // for { etc } - - // s := nextS - MOVQ R13, SI - - // bytesBetweenHashLookups := skip >> 5 - MOVQ R12, R14 - SHRQ $5, R14 - - // nextS = s + bytesBetweenHashLookups - ADDQ R14, R13 - - // skip += bytesBetweenHashLookups - ADDQ R14, R12 - - // if nextS > sLimit { goto emitRemainder } - MOVQ R13, AX - SUBQ DX, AX - CMPQ AX, R9 - JA emitRemainder - - // candidate = int(table[nextHash]) - // XXX: MOVWQZX table-32768(SP)(R11*2), R15 - // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 - BYTE $0x4e - BYTE $0x0f - BYTE $0xb7 - BYTE $0x7c - BYTE $0x5c - BYTE $0x78 - - // table[nextHash] = uint16(s) - MOVQ SI, AX - SUBQ DX, AX - - // XXX: MOVW AX, table-32768(SP)(R11*2) - // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) - BYTE $0x66 - BYTE $0x42 - BYTE $0x89 - BYTE $0x44 - BYTE $0x5c - BYTE $0x78 - - // nextHash = hash(load32(src, nextS), shift) - MOVL 0(R13), R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // if load32(src, s) != load32(src, candidate) { continue } break - MOVL 0(SI), AX - MOVL (DX)(R15*1), BX - CMPL AX, BX - JNE inner0 - -fourByteMatch: - // As per the encode_other.go code: - // - // A 4-byte match has been found. We'll later see etc. - - // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment - // on inputMargin in encode.go. - MOVQ SI, AX - SUBQ R10, AX - CMPQ AX, $16 - JLE emitLiteralFastPath - - // ---------------------------------------- - // Begin inline of the emitLiteral call. - // - // d += emitLiteral(dst[d:], src[nextEmit:s]) - - MOVL AX, BX - SUBL $1, BX - - CMPL BX, $60 - JLT inlineEmitLiteralOneByte - CMPL BX, $256 - JLT inlineEmitLiteralTwoBytes - -inlineEmitLiteralThreeBytes: - MOVB $0xf4, 0(DI) - MOVW BX, 1(DI) - ADDQ $3, DI - JMP inlineEmitLiteralMemmove - -inlineEmitLiteralTwoBytes: - MOVB $0xf0, 0(DI) - MOVB BX, 1(DI) - ADDQ $2, DI - JMP inlineEmitLiteralMemmove - -inlineEmitLiteralOneByte: - SHLB $2, BX - MOVB BX, 0(DI) - ADDQ $1, DI - -inlineEmitLiteralMemmove: - // Spill local variables (registers) onto the stack; call; unspill. - // - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // DI, R10 and AX as arguments. - MOVQ DI, 0(SP) - MOVQ R10, 8(SP) - MOVQ AX, 16(SP) - ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)". - MOVQ SI, 72(SP) - MOVQ DI, 80(SP) - MOVQ R15, 112(SP) - CALL runtime·memmove(SB) - MOVQ 56(SP), CX - MOVQ 64(SP), DX - MOVQ 72(SP), SI - MOVQ 80(SP), DI - MOVQ 88(SP), R9 - MOVQ 112(SP), R15 - JMP inner1 - -inlineEmitLiteralEnd: - // End inline of the emitLiteral call. - // ---------------------------------------- - -emitLiteralFastPath: - // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2". - MOVB AX, BX - SUBB $1, BX - SHLB $2, BX - MOVB BX, (DI) - ADDQ $1, DI - - // !!! Implement the copy from lit to dst as a 16-byte load and store. - // (Encode's documentation says that dst and src must not overlap.) - // - // This always copies 16 bytes, instead of only len(lit) bytes, but that's - // OK. Subsequent iterations will fix up the overrun. - // - // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or - // 16-byte loads and stores. 
This technique probably wouldn't be as - // effective on architectures that are fussier about alignment. - MOVOU 0(R10), X0 - MOVOU X0, 0(DI) - ADDQ AX, DI - -inner1: - // for { etc } - - // base := s - MOVQ SI, R12 - - // !!! offset := base - candidate - MOVQ R12, R11 - SUBQ R15, R11 - SUBQ DX, R11 - - // ---------------------------------------- - // Begin inline of the extendMatch call. - // - // s = extendMatch(src, candidate+4, s+4) - - // !!! R14 = &src[len(src)] - MOVQ src_len+32(FP), R14 - ADDQ DX, R14 - - // !!! R13 = &src[len(src) - 8] - MOVQ R14, R13 - SUBQ $8, R13 - - // !!! R15 = &src[candidate + 4] - ADDQ $4, R15 - ADDQ DX, R15 - - // !!! s += 4 - ADDQ $4, SI - -inlineExtendMatchCmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMPQ SI, R13 - JA inlineExtendMatchCmp1 - MOVQ (R15), AX - MOVQ (SI), BX - CMPQ AX, BX - JNE inlineExtendMatchBSF - ADDQ $8, R15 - ADDQ $8, SI - JMP inlineExtendMatchCmp8 - -inlineExtendMatchBSF: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. The BSF instruction finds the - // least significant 1 bit, the amd64 architecture is little-endian, and - // the shift by 3 converts a bit index to a byte index. - XORQ AX, BX - BSFQ BX, BX - SHRQ $3, BX - ADDQ BX, SI - JMP inlineExtendMatchEnd - -inlineExtendMatchCmp1: - // In src's tail, compare 1 byte at a time. - CMPQ SI, R14 - JAE inlineExtendMatchEnd - MOVB (R15), AX - MOVB (SI), BX - CMPB AX, BX - JNE inlineExtendMatchEnd - ADDQ $1, R15 - ADDQ $1, SI - JMP inlineExtendMatchCmp1 - -inlineExtendMatchEnd: - // End inline of the extendMatch call. - // ---------------------------------------- - - // ---------------------------------------- - // Begin inline of the emitCopy call. - // - // d += emitCopy(dst[d:], base-candidate, s-base) - - // !!! length := s - base - MOVQ SI, AX - SUBQ R12, AX - -inlineEmitCopyLoop0: - // for length >= 68 { etc } - CMPL AX, $68 - JLT inlineEmitCopyStep1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVB $0xfe, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $64, AX - JMP inlineEmitCopyLoop0 - -inlineEmitCopyStep1: - // if length > 64 { etc } - CMPL AX, $64 - JLE inlineEmitCopyStep2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVB $0xee, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $60, AX - -inlineEmitCopyStep2: - // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 } - CMPL AX, $12 - JGE inlineEmitCopyStep3 - CMPL R11, $2048 - JGE inlineEmitCopyStep3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(DI) - SHRL $8, R11 - SHLB $5, R11 - SUBB $4, AX - SHLB $2, AX - ORB AX, R11 - ORB $1, R11 - MOVB R11, 0(DI) - ADDQ $2, DI - JMP inlineEmitCopyEnd - -inlineEmitCopyStep3: - // Emit the remaining copy, encoded as 3 bytes. - SUBL $1, AX - SHLB $2, AX - ORB $2, AX - MOVB AX, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - -inlineEmitCopyEnd: - // End inline of the emitCopy call. - // ---------------------------------------- - - // nextEmit = s - MOVQ SI, R10 - - // if s >= sLimit { goto emitRemainder } - MOVQ SI, AX - SUBQ DX, AX - CMPQ AX, R9 - JAE emitRemainder - - // As per the encode_other.go code: - // - // We could immediately etc. 
- - // x := load64(src, s-1) - MOVQ -1(SI), R14 - - // prevHash := hash(uint32(x>>0), shift) - MOVL R14, R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // table[prevHash] = uint16(s-1) - MOVQ SI, AX - SUBQ DX, AX - SUBQ $1, AX - - // XXX: MOVW AX, table-32768(SP)(R11*2) - // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) - BYTE $0x66 - BYTE $0x42 - BYTE $0x89 - BYTE $0x44 - BYTE $0x5c - BYTE $0x78 - - // currHash := hash(uint32(x>>8), shift) - SHRQ $8, R14 - MOVL R14, R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // candidate = int(table[currHash]) - // XXX: MOVWQZX table-32768(SP)(R11*2), R15 - // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 - BYTE $0x4e - BYTE $0x0f - BYTE $0xb7 - BYTE $0x7c - BYTE $0x5c - BYTE $0x78 - - // table[currHash] = uint16(s) - ADDQ $1, AX - - // XXX: MOVW AX, table-32768(SP)(R11*2) - // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) - BYTE $0x66 - BYTE $0x42 - BYTE $0x89 - BYTE $0x44 - BYTE $0x5c - BYTE $0x78 - - // if uint32(x>>8) == load32(src, candidate) { continue } - MOVL (DX)(R15*1), BX - CMPL R14, BX - JEQ inner1 - - // nextHash = hash(uint32(x>>16), shift) - SHRQ $8, R14 - MOVL R14, R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // s++ - ADDQ $1, SI - - // break out of the inner1 for loop, i.e. continue the outer loop. - JMP outer - -emitRemainder: - // if nextEmit < len(src) { etc } - MOVQ src_len+32(FP), AX - ADDQ DX, AX - CMPQ R10, AX - JEQ encodeBlockEnd - - // d += emitLiteral(dst[d:], src[nextEmit:]) - // - // Push args. - MOVQ DI, 0(SP) - MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative. - MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative. - MOVQ R10, 24(SP) - SUBQ R10, AX - MOVQ AX, 32(SP) - MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative. - - // Spill local variables (registers) onto the stack; call; unspill. - MOVQ DI, 80(SP) - CALL ·emitLiteral(SB) - MOVQ 80(SP), DI - - // Finish the "d +=" part of "d += emitLiteral(etc)". - ADDQ 48(SP), DI - -encodeBlockEnd: - MOVQ dst_base+0(FP), AX - SUBQ AX, DI - MOVQ DI, d+48(FP) - RET diff --git a/vendor/github.com/golang/snappy/encode_arm64.s b/vendor/github.com/golang/snappy/encode_arm64.s deleted file mode 100644 index f8d54adfc5..0000000000 --- a/vendor/github.com/golang/snappy/encode_arm64.s +++ /dev/null @@ -1,722 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" - -// The asm code generally follows the pure Go code in encode_other.go, except -// where marked with a "!!!". - -// ---------------------------------------------------------------------------- - -// func emitLiteral(dst, lit []byte) int -// -// All local variables fit into registers. The register allocation: -// - R3 len(lit) -// - R4 n -// - R6 return value -// - R8 &dst[i] -// - R10 &lit[0] -// -// The 32 bytes of stack space is to call runtime·memmove. -// -// The unusual register allocation of local variables, such as R10 for the -// source pointer, matches the allocation used at the call site in encodeBlock, -// which makes it easier to manually inline this function. 
-TEXT ·emitLiteral(SB), NOSPLIT, $32-56 - MOVD dst_base+0(FP), R8 - MOVD lit_base+24(FP), R10 - MOVD lit_len+32(FP), R3 - MOVD R3, R6 - MOVW R3, R4 - SUBW $1, R4, R4 - - CMPW $60, R4 - BLT oneByte - CMPW $256, R4 - BLT twoBytes - -threeBytes: - MOVD $0xf4, R2 - MOVB R2, 0(R8) - MOVW R4, 1(R8) - ADD $3, R8, R8 - ADD $3, R6, R6 - B memmove - -twoBytes: - MOVD $0xf0, R2 - MOVB R2, 0(R8) - MOVB R4, 1(R8) - ADD $2, R8, R8 - ADD $2, R6, R6 - B memmove - -oneByte: - LSLW $2, R4, R4 - MOVB R4, 0(R8) - ADD $1, R8, R8 - ADD $1, R6, R6 - -memmove: - MOVD R6, ret+48(FP) - - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // R8, R10 and R3 as arguments. - MOVD R8, 8(RSP) - MOVD R10, 16(RSP) - MOVD R3, 24(RSP) - CALL runtime·memmove(SB) - RET - -// ---------------------------------------------------------------------------- - -// func emitCopy(dst []byte, offset, length int) int -// -// All local variables fit into registers. The register allocation: -// - R3 length -// - R7 &dst[0] -// - R8 &dst[i] -// - R11 offset -// -// The unusual register allocation of local variables, such as R11 for the -// offset, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. -TEXT ·emitCopy(SB), NOSPLIT, $0-48 - MOVD dst_base+0(FP), R8 - MOVD R8, R7 - MOVD offset+24(FP), R11 - MOVD length+32(FP), R3 - -loop0: - // for length >= 68 { etc } - CMPW $68, R3 - BLT step1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVD $0xfe, R2 - MOVB R2, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - SUB $64, R3, R3 - B loop0 - -step1: - // if length > 64 { etc } - CMP $64, R3 - BLE step2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVD $0xee, R2 - MOVB R2, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - SUB $60, R3, R3 - -step2: - // if length >= 12 || offset >= 2048 { goto step3 } - CMP $12, R3 - BGE step3 - CMPW $2048, R11 - BGE step3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(R8) - LSRW $3, R11, R11 - AND $0xe0, R11, R11 - SUB $4, R3, R3 - LSLW $2, R3 - AND $0xff, R3, R3 - ORRW R3, R11, R11 - ORRW $1, R11, R11 - MOVB R11, 0(R8) - ADD $2, R8, R8 - - // Return the number of bytes written. - SUB R7, R8, R8 - MOVD R8, ret+40(FP) - RET - -step3: - // Emit the remaining copy, encoded as 3 bytes. - SUB $1, R3, R3 - AND $0xff, R3, R3 - LSLW $2, R3, R3 - ORRW $2, R3, R3 - MOVB R3, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - - // Return the number of bytes written. - SUB R7, R8, R8 - MOVD R8, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func extendMatch(src []byte, i, j int) int -// -// All local variables fit into registers. The register allocation: -// - R6 &src[0] -// - R7 &src[j] -// - R13 &src[len(src) - 8] -// - R14 &src[len(src)] -// - R15 &src[i] -// -// The unusual register allocation of local variables, such as R15 for a source -// pointer, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. -TEXT ·extendMatch(SB), NOSPLIT, $0-48 - MOVD src_base+0(FP), R6 - MOVD src_len+8(FP), R14 - MOVD i+24(FP), R15 - MOVD j+32(FP), R7 - ADD R6, R14, R14 - ADD R6, R15, R15 - ADD R6, R7, R7 - MOVD R14, R13 - SUB $8, R13, R13 - -cmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. 
- CMP R13, R7 - BHI cmp1 - MOVD (R15), R3 - MOVD (R7), R4 - CMP R4, R3 - BNE bsf - ADD $8, R15, R15 - ADD $8, R7, R7 - B cmp8 - -bsf: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. - // RBIT reverses the bit order, then CLZ counts the leading zeros, the - // combination of which finds the least significant bit which is set. - // The arm64 architecture is little-endian, and the shift by 3 converts - // a bit index to a byte index. - EOR R3, R4, R4 - RBIT R4, R4 - CLZ R4, R4 - ADD R4>>3, R7, R7 - - // Convert from &src[ret] to ret. - SUB R6, R7, R7 - MOVD R7, ret+40(FP) - RET - -cmp1: - // In src's tail, compare 1 byte at a time. - CMP R7, R14 - BLS extendMatchEnd - MOVB (R15), R3 - MOVB (R7), R4 - CMP R4, R3 - BNE extendMatchEnd - ADD $1, R15, R15 - ADD $1, R7, R7 - B cmp1 - -extendMatchEnd: - // Convert from &src[ret] to ret. - SUB R6, R7, R7 - MOVD R7, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func encodeBlock(dst, src []byte) (d int) -// -// All local variables fit into registers, other than "var table". The register -// allocation: -// - R3 . . -// - R4 . . -// - R5 64 shift -// - R6 72 &src[0], tableSize -// - R7 80 &src[s] -// - R8 88 &dst[d] -// - R9 96 sLimit -// - R10 . &src[nextEmit] -// - R11 104 prevHash, currHash, nextHash, offset -// - R12 112 &src[base], skip -// - R13 . &src[nextS], &src[len(src) - 8] -// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x -// - R15 120 candidate -// - R16 . hash constant, 0x1e35a7bd -// - R17 . &table -// - . 128 table -// -// The second column (64, 72, etc) is the stack offset to spill the registers -// when calling other functions. We could pack this slightly tighter, but it's -// simpler to have a dedicated spill map independent of the function called. -// -// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An -// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill -// local variables (registers) during calls gives 32768 + 64 + 64 = 32896. -TEXT ·encodeBlock(SB), 0, $32896-56 - MOVD dst_base+0(FP), R8 - MOVD src_base+24(FP), R7 - MOVD src_len+32(FP), R14 - - // shift, tableSize := uint32(32-8), 1<<8 - MOVD $24, R5 - MOVD $256, R6 - MOVW $0xa7bd, R16 - MOVKW $(0x1e35<<16), R16 - -calcShift: - // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { - // shift-- - // } - MOVD $16384, R2 - CMP R2, R6 - BGE varTable - CMP R14, R6 - BGE varTable - SUB $1, R5, R5 - LSL $1, R6, R6 - B calcShift - -varTable: - // var table [maxTableSize]uint16 - // - // In the asm code, unlike the Go code, we can zero-initialize only the - // first tableSize elements. Each uint16 element is 2 bytes and each - // iterations writes 64 bytes, so we can do only tableSize/32 writes - // instead of the 2048 writes that would zero-initialize all of table's - // 32768 bytes. This clear could overrun the first tableSize elements, but - // it won't overrun the allocated stack size. - ADD $128, RSP, R17 - MOVD R17, R4 - - // !!! R6 = &src[tableSize] - ADD R6<<1, R17, R6 - -memclr: - STP.P (ZR, ZR), 64(R4) - STP (ZR, ZR), -48(R4) - STP (ZR, ZR), -32(R4) - STP (ZR, ZR), -16(R4) - CMP R4, R6 - BHI memclr - - // !!! R6 = &src[0] - MOVD R7, R6 - - // sLimit := len(src) - inputMargin - MOVD R14, R9 - SUB $15, R9, R9 - - // !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't - // change for the rest of the function. 
- MOVD R5, 64(RSP) - MOVD R6, 72(RSP) - MOVD R9, 96(RSP) - - // nextEmit := 0 - MOVD R6, R10 - - // s := 1 - ADD $1, R7, R7 - - // nextHash := hash(load32(src, s), shift) - MOVW 0(R7), R11 - MULW R16, R11, R11 - LSRW R5, R11, R11 - -outer: - // for { etc } - - // skip := 32 - MOVD $32, R12 - - // nextS := s - MOVD R7, R13 - - // candidate := 0 - MOVD $0, R15 - -inner0: - // for { etc } - - // s := nextS - MOVD R13, R7 - - // bytesBetweenHashLookups := skip >> 5 - MOVD R12, R14 - LSR $5, R14, R14 - - // nextS = s + bytesBetweenHashLookups - ADD R14, R13, R13 - - // skip += bytesBetweenHashLookups - ADD R14, R12, R12 - - // if nextS > sLimit { goto emitRemainder } - MOVD R13, R3 - SUB R6, R3, R3 - CMP R9, R3 - BHI emitRemainder - - // candidate = int(table[nextHash]) - MOVHU 0(R17)(R11<<1), R15 - - // table[nextHash] = uint16(s) - MOVD R7, R3 - SUB R6, R3, R3 - - MOVH R3, 0(R17)(R11<<1) - - // nextHash = hash(load32(src, nextS), shift) - MOVW 0(R13), R11 - MULW R16, R11 - LSRW R5, R11, R11 - - // if load32(src, s) != load32(src, candidate) { continue } break - MOVW 0(R7), R3 - MOVW (R6)(R15), R4 - CMPW R4, R3 - BNE inner0 - -fourByteMatch: - // As per the encode_other.go code: - // - // A 4-byte match has been found. We'll later see etc. - - // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment - // on inputMargin in encode.go. - MOVD R7, R3 - SUB R10, R3, R3 - CMP $16, R3 - BLE emitLiteralFastPath - - // ---------------------------------------- - // Begin inline of the emitLiteral call. - // - // d += emitLiteral(dst[d:], src[nextEmit:s]) - - MOVW R3, R4 - SUBW $1, R4, R4 - - MOVW $60, R2 - CMPW R2, R4 - BLT inlineEmitLiteralOneByte - MOVW $256, R2 - CMPW R2, R4 - BLT inlineEmitLiteralTwoBytes - -inlineEmitLiteralThreeBytes: - MOVD $0xf4, R1 - MOVB R1, 0(R8) - MOVW R4, 1(R8) - ADD $3, R8, R8 - B inlineEmitLiteralMemmove - -inlineEmitLiteralTwoBytes: - MOVD $0xf0, R1 - MOVB R1, 0(R8) - MOVB R4, 1(R8) - ADD $2, R8, R8 - B inlineEmitLiteralMemmove - -inlineEmitLiteralOneByte: - LSLW $2, R4, R4 - MOVB R4, 0(R8) - ADD $1, R8, R8 - -inlineEmitLiteralMemmove: - // Spill local variables (registers) onto the stack; call; unspill. - // - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // R8, R10 and R3 as arguments. - MOVD R8, 8(RSP) - MOVD R10, 16(RSP) - MOVD R3, 24(RSP) - - // Finish the "d +=" part of "d += emitLiteral(etc)". - ADD R3, R8, R8 - MOVD R7, 80(RSP) - MOVD R8, 88(RSP) - MOVD R15, 120(RSP) - CALL runtime·memmove(SB) - MOVD 64(RSP), R5 - MOVD 72(RSP), R6 - MOVD 80(RSP), R7 - MOVD 88(RSP), R8 - MOVD 96(RSP), R9 - MOVD 120(RSP), R15 - ADD $128, RSP, R17 - MOVW $0xa7bd, R16 - MOVKW $(0x1e35<<16), R16 - B inner1 - -inlineEmitLiteralEnd: - // End inline of the emitLiteral call. - // ---------------------------------------- - -emitLiteralFastPath: - // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2". - MOVB R3, R4 - SUBW $1, R4, R4 - AND $0xff, R4, R4 - LSLW $2, R4, R4 - MOVB R4, (R8) - ADD $1, R8, R8 - - // !!! Implement the copy from lit to dst as a 16-byte load and store. - // (Encode's documentation says that dst and src must not overlap.) - // - // This always copies 16 bytes, instead of only len(lit) bytes, but that's - // OK. Subsequent iterations will fix up the overrun. - // - // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or - // 16-byte loads and stores. This technique probably wouldn't be as - // effective on architectures that are fussier about alignment. 
- LDP 0(R10), (R0, R1) - STP (R0, R1), 0(R8) - ADD R3, R8, R8 - -inner1: - // for { etc } - - // base := s - MOVD R7, R12 - - // !!! offset := base - candidate - MOVD R12, R11 - SUB R15, R11, R11 - SUB R6, R11, R11 - - // ---------------------------------------- - // Begin inline of the extendMatch call. - // - // s = extendMatch(src, candidate+4, s+4) - - // !!! R14 = &src[len(src)] - MOVD src_len+32(FP), R14 - ADD R6, R14, R14 - - // !!! R13 = &src[len(src) - 8] - MOVD R14, R13 - SUB $8, R13, R13 - - // !!! R15 = &src[candidate + 4] - ADD $4, R15, R15 - ADD R6, R15, R15 - - // !!! s += 4 - ADD $4, R7, R7 - -inlineExtendMatchCmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMP R13, R7 - BHI inlineExtendMatchCmp1 - MOVD (R15), R3 - MOVD (R7), R4 - CMP R4, R3 - BNE inlineExtendMatchBSF - ADD $8, R15, R15 - ADD $8, R7, R7 - B inlineExtendMatchCmp8 - -inlineExtendMatchBSF: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. - // RBIT reverses the bit order, then CLZ counts the leading zeros, the - // combination of which finds the least significant bit which is set. - // The arm64 architecture is little-endian, and the shift by 3 converts - // a bit index to a byte index. - EOR R3, R4, R4 - RBIT R4, R4 - CLZ R4, R4 - ADD R4>>3, R7, R7 - B inlineExtendMatchEnd - -inlineExtendMatchCmp1: - // In src's tail, compare 1 byte at a time. - CMP R7, R14 - BLS inlineExtendMatchEnd - MOVB (R15), R3 - MOVB (R7), R4 - CMP R4, R3 - BNE inlineExtendMatchEnd - ADD $1, R15, R15 - ADD $1, R7, R7 - B inlineExtendMatchCmp1 - -inlineExtendMatchEnd: - // End inline of the extendMatch call. - // ---------------------------------------- - - // ---------------------------------------- - // Begin inline of the emitCopy call. - // - // d += emitCopy(dst[d:], base-candidate, s-base) - - // !!! length := s - base - MOVD R7, R3 - SUB R12, R3, R3 - -inlineEmitCopyLoop0: - // for length >= 68 { etc } - MOVW $68, R2 - CMPW R2, R3 - BLT inlineEmitCopyStep1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVD $0xfe, R1 - MOVB R1, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - SUBW $64, R3, R3 - B inlineEmitCopyLoop0 - -inlineEmitCopyStep1: - // if length > 64 { etc } - MOVW $64, R2 - CMPW R2, R3 - BLE inlineEmitCopyStep2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVD $0xee, R1 - MOVB R1, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - SUBW $60, R3, R3 - -inlineEmitCopyStep2: - // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 } - MOVW $12, R2 - CMPW R2, R3 - BGE inlineEmitCopyStep3 - MOVW $2048, R2 - CMPW R2, R11 - BGE inlineEmitCopyStep3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(R8) - LSRW $8, R11, R11 - LSLW $5, R11, R11 - SUBW $4, R3, R3 - AND $0xff, R3, R3 - LSLW $2, R3, R3 - ORRW R3, R11, R11 - ORRW $1, R11, R11 - MOVB R11, 0(R8) - ADD $2, R8, R8 - B inlineEmitCopyEnd - -inlineEmitCopyStep3: - // Emit the remaining copy, encoded as 3 bytes. - SUBW $1, R3, R3 - LSLW $2, R3, R3 - ORRW $2, R3, R3 - MOVB R3, 0(R8) - MOVW R11, 1(R8) - ADD $3, R8, R8 - -inlineEmitCopyEnd: - // End inline of the emitCopy call. - // ---------------------------------------- - - // nextEmit = s - MOVD R7, R10 - - // if s >= sLimit { goto emitRemainder } - MOVD R7, R3 - SUB R6, R3, R3 - CMP R3, R9 - BLS emitRemainder - - // As per the encode_other.go code: - // - // We could immediately etc. 
- - // x := load64(src, s-1) - MOVD -1(R7), R14 - - // prevHash := hash(uint32(x>>0), shift) - MOVW R14, R11 - MULW R16, R11, R11 - LSRW R5, R11, R11 - - // table[prevHash] = uint16(s-1) - MOVD R7, R3 - SUB R6, R3, R3 - SUB $1, R3, R3 - - MOVHU R3, 0(R17)(R11<<1) - - // currHash := hash(uint32(x>>8), shift) - LSR $8, R14, R14 - MOVW R14, R11 - MULW R16, R11, R11 - LSRW R5, R11, R11 - - // candidate = int(table[currHash]) - MOVHU 0(R17)(R11<<1), R15 - - // table[currHash] = uint16(s) - ADD $1, R3, R3 - MOVHU R3, 0(R17)(R11<<1) - - // if uint32(x>>8) == load32(src, candidate) { continue } - MOVW (R6)(R15), R4 - CMPW R4, R14 - BEQ inner1 - - // nextHash = hash(uint32(x>>16), shift) - LSR $8, R14, R14 - MOVW R14, R11 - MULW R16, R11, R11 - LSRW R5, R11, R11 - - // s++ - ADD $1, R7, R7 - - // break out of the inner1 for loop, i.e. continue the outer loop. - B outer - -emitRemainder: - // if nextEmit < len(src) { etc } - MOVD src_len+32(FP), R3 - ADD R6, R3, R3 - CMP R3, R10 - BEQ encodeBlockEnd - - // d += emitLiteral(dst[d:], src[nextEmit:]) - // - // Push args. - MOVD R8, 8(RSP) - MOVD $0, 16(RSP) // Unnecessary, as the callee ignores it, but conservative. - MOVD $0, 24(RSP) // Unnecessary, as the callee ignores it, but conservative. - MOVD R10, 32(RSP) - SUB R10, R3, R3 - MOVD R3, 40(RSP) - MOVD R3, 48(RSP) // Unnecessary, as the callee ignores it, but conservative. - - // Spill local variables (registers) onto the stack; call; unspill. - MOVD R8, 88(RSP) - CALL ·emitLiteral(SB) - MOVD 88(RSP), R8 - - // Finish the "d +=" part of "d += emitLiteral(etc)". - MOVD 56(RSP), R1 - ADD R1, R8, R8 - -encodeBlockEnd: - MOVD dst_base+0(FP), R3 - SUB R3, R8, R8 - MOVD R8, d+48(FP) - RET diff --git a/vendor/github.com/golang/snappy/encode_asm.go b/vendor/github.com/golang/snappy/encode_asm.go deleted file mode 100644 index 107c1e7141..0000000000 --- a/vendor/github.com/golang/snappy/encode_asm.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm -// +build amd64 arm64 - -package snappy - -// emitLiteral has the same semantics as in encode_other.go. -// -//go:noescape -func emitLiteral(dst, lit []byte) int - -// emitCopy has the same semantics as in encode_other.go. -// -//go:noescape -func emitCopy(dst []byte, offset, length int) int - -// extendMatch has the same semantics as in encode_other.go. -// -//go:noescape -func extendMatch(src []byte, i, j int) int - -// encodeBlock has the same semantics as in encode_other.go. -// -//go:noescape -func encodeBlock(dst, src []byte) (d int) diff --git a/vendor/github.com/golang/snappy/snappy.go b/vendor/github.com/golang/snappy/snappy.go deleted file mode 100644 index ece692ea46..0000000000 --- a/vendor/github.com/golang/snappy/snappy.go +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package snappy implements the Snappy compression format. It aims for very -// high speeds and reasonable compression. -// -// There are actually two Snappy formats: block and stream. They are related, -// but different: trying to decompress block-compressed data as a Snappy stream -// will fail, and vice versa. 
The block format is the Decode and Encode -// functions and the stream format is the Reader and Writer types. -// -// The block format, the more common case, is used when the complete size (the -// number of bytes) of the original data is known upfront, at the time -// compression starts. The stream format, also known as the framing format, is -// for when that isn't always true. -// -// The canonical, C++ implementation is at https://github.com/google/snappy and -// it only implements the block format. -package snappy // import "github.com/golang/snappy" - -import ( - "hash/crc32" -) - -/* -Each encoded block begins with the varint-encoded length of the decoded data, -followed by a sequence of chunks. Chunks begin and end on byte boundaries. The -first byte of each chunk is broken into its 2 least and 6 most significant bits -called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. -Zero means a literal tag. All other values mean a copy tag. - -For literal tags: - - If m < 60, the next 1 + m bytes are literal bytes. - - Otherwise, let n be the little-endian unsigned integer denoted by the next - m - 59 bytes. The next 1 + n bytes after that are literal bytes. - -For copy tags, length bytes are copied from offset bytes ago, in the style of -Lempel-Ziv compression algorithms. In particular: - - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). - The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 - of the offset. The next byte is bits 0-7 of the offset. - - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). - The length is 1 + m. The offset is the little-endian unsigned integer - denoted by the next 2 bytes. - - For l == 3, this tag is a legacy format that is no longer issued by most - encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in - [1, 65). The length is 1 + m. The offset is the little-endian unsigned - integer denoted by the next 4 bytes. -*/ -const ( - tagLiteral = 0x00 - tagCopy1 = 0x01 - tagCopy2 = 0x02 - tagCopy4 = 0x03 -) - -const ( - checksumSize = 4 - chunkHeaderSize = 4 - magicChunk = "\xff\x06\x00\x00" + magicBody - magicBody = "sNaPpY" - - // maxBlockSize is the maximum size of the input to encodeBlock. It is not - // part of the wire format per se, but some parts of the encoder assume - // that an offset fits into a uint16. - // - // Also, for the framing format (Writer type instead of Encode function), - // https://github.com/google/snappy/blob/master/framing_format.txt says - // that "the uncompressed data in a chunk must be no longer than 65536 - // bytes". - maxBlockSize = 65536 - - // maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is - // hard coded to be a const instead of a variable, so that obufLen can also - // be a const. Their equivalence is confirmed by - // TestMaxEncodedLenOfMaxBlockSize. 
- maxEncodedLenOfMaxBlockSize = 76490 - - obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize - obufLen = obufHeaderLen + maxEncodedLenOfMaxBlockSize -) - -const ( - chunkTypeCompressedData = 0x00 - chunkTypeUncompressedData = 0x01 - chunkTypePadding = 0xfe - chunkTypeStreamIdentifier = 0xff -) - -var crcTable = crc32.MakeTable(crc32.Castagnoli) - -// crc implements the checksum specified in section 3 of -// https://github.com/google/snappy/blob/master/framing_format.txt -func crc(b []byte) uint32 { - c := crc32.Update(0, crcTable, b) - return uint32(c>>15|c<<17) + 0xa282ead8 -} diff --git a/vendor/github.com/hashicorp/errwrap/LICENSE b/vendor/github.com/hashicorp/errwrap/LICENSE new file mode 100644 index 0000000000..c33dcc7c92 --- /dev/null +++ b/vendor/github.com/hashicorp/errwrap/LICENSE @@ -0,0 +1,354 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. “Contributor” + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. “Contributor Version” + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor’s Contribution. + +1.3. “Contribution” + + means Covered Software of a particular Contributor. + +1.4. “Covered Software” + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. “Incompatible With Secondary Licenses” + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of version + 1.1 or earlier of the License, but not also under the terms of a + Secondary License. + +1.6. “Executable Form” + + means any form of the work other than Source Code Form. + +1.7. “Larger Work” + + means a work that combines Covered Software with other material, in a separate + file or files, that is not Covered Software. + +1.8. “License” + + means this document. + +1.9. “Licensable” + + means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently, any and all of the rights conveyed by + this License. + +1.10. “Modifications” + + means any of the following: + + a. any file in Source Code Form that results from an addition to, deletion + from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. “Patent Claims” of a Contributor + + means any patent claim(s), including without limitation, method, process, + and apparatus claims, in any patent Licensable by such Contributor that + would be infringed, but for the grant of the License, by the making, + using, selling, offering for sale, having made, import, or transfer of + either its Contributions or its Contributor Version. + +1.12. “Secondary License” + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. “Source Code Form” + + means the form of the work preferred for making modifications. + +1.14. “You” (or “Your”) + + means an individual or a legal entity exercising rights under this + License. 
For legal entities, “You” includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, “control” means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or as + part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its Contributions + or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution become + effective for each Contribution on the date the Contributor first distributes + such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under this + License. No additional rights or licenses will be implied from the distribution + or licensing of Covered Software under this License. Notwithstanding Section + 2.1(b) above, no patent license is granted by a Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party’s + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of its + Contributions. + + This License does not grant any rights in the trademarks, service marks, or + logos of any Contributor (except as may be necessary to comply with the + notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this License + (see Section 10.2) or under the terms of a Secondary License (if permitted + under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its Contributions + are its original creation(s) or it has sufficient rights to grant the + rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under applicable + copyright doctrines of fair use, fair dealing, or other equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under the + terms of this License. You must inform recipients that the Source Code Form + of the Covered Software is governed by the terms of this License, and how + they can obtain a copy of this License. You may not attempt to alter or + restrict the recipients’ rights in the Source Code Form. + +3.2. 
Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this License, + or sublicense it under different terms, provided that the license for + the Executable Form does not attempt to limit or alter the recipients’ + rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for the + Covered Software. If the Larger Work is a combination of Covered Software + with a work governed by one or more Secondary Licenses, and the Covered + Software is not Incompatible With Secondary Licenses, this License permits + You to additionally distribute such Covered Software under the terms of + such Secondary License(s), so that the recipient of the Larger Work may, at + their option, further distribute the Covered Software under the terms of + either this License or such Secondary License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices (including + copyright notices, patent notices, disclaimers of warranty, or limitations + of liability) contained within the Source Code Form of the Covered + Software, except that You may alter any license notices to the extent + required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on behalf + of any Contributor. You must make it absolutely clear that any such + warranty, support, indemnity, or liability obligation is offered by You + alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, judicial + order, or regulation then You must: (a) comply with the terms of this License + to the maximum extent possible; and (b) describe the limitations and the code + they affect. Such description must be placed in a text file included with all + distributions of the Covered Software under this License. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. 
However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing basis, + if such Contributor fails to notify You of the non-compliance by some + reasonable means prior to 60 days after You have come back into compliance. + Moreover, Your grants from a particular Contributor are reinstated on an + ongoing basis if such Contributor notifies You of the non-compliance by + some reasonable means, this is the first time You have received notice of + non-compliance with this License from such Contributor, and You become + compliant prior to 30 days after Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, counter-claims, + and cross-claims) alleging that a Contributor Version directly or + indirectly infringes any patent, then the rights granted to You by any and + all Contributors for the Covered Software under Section 2.1 of this License + shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an “as is” basis, without + warranty of any kind, either expressed, implied, or statutory, including, + without limitation, warranties that the Covered Software is free of defects, + merchantable, fit for a particular purpose or non-infringing. The entire + risk as to the quality and performance of the Covered Software is with You. + Should any Covered Software prove defective in any respect, You (not any + Contributor) assume the cost of any necessary servicing, repair, or + correction. This disclaimer of warranty constitutes an essential part of this + License. No use of any Covered Software is authorized under this License + except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from such + party’s negligence to the extent applicable law prohibits such limitation. + Some jurisdictions do not allow the exclusion or limitation of incidental or + consequential damages, so this exclusion and limitation may not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts of + a jurisdiction where the defendant maintains its principal place of business + and such litigation shall be governed by laws of that jurisdiction, without + reference to its conflict-of-law provisions. Nothing in this Section shall + prevent a party’s ability to bring cross-claims or counter-claims. + +9. 
Miscellaneous + + This License represents the complete agreement concerning the subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. Any law or regulation which provides that the language of a + contract shall be construed against the drafter shall not be used to construe + this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version of + the License under which You originally received the Covered Software, or + under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a modified + version of this License if you rename the license and remove any + references to the name of the license steward (except to note that such + modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses + If You choose to distribute Source Code Form that is Incompatible With + Secondary Licenses under the terms of this version of the License, the + notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, then +You may include the notice in a location (such as a LICENSE file in a relevant +directory) where a recipient would be likely to look for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - “Incompatible With Secondary Licenses” Notice + + This Source Code Form is “Incompatible + With Secondary Licenses”, as defined by + the Mozilla Public License, v. 2.0. + diff --git a/vendor/github.com/hashicorp/errwrap/README.md b/vendor/github.com/hashicorp/errwrap/README.md new file mode 100644 index 0000000000..444df08f8e --- /dev/null +++ b/vendor/github.com/hashicorp/errwrap/README.md @@ -0,0 +1,89 @@ +# errwrap + +`errwrap` is a package for Go that formalizes the pattern of wrapping errors +and checking if an error contains another error. + +There is a common pattern in Go of taking a returned `error` value and +then wrapping it (such as with `fmt.Errorf`) before returning it. The problem +with this pattern is that you completely lose the original `error` structure. + +Arguably the _correct_ approach is that you should make a custom structure +implementing the `error` interface, and have the original error as a field +on that structure, such [as this example](http://golang.org/pkg/os/#PathError). +This is a good approach, but you have to know the entire chain of possible +rewrapping that happens, when you might just care about one. 
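To make the custom-type approach described above concrete, here is a minimal, hedged sketch of such an error type; the `QueryError` name and its fields are hypothetical, chosen only for illustration, and this block is not part of the vendored README:

```go
package main

import (
	"errors"
	"fmt"
)

// QueryError is a hypothetical custom error type: it implements the error
// interface while keeping the original error as a field, in the style of
// os.PathError.
type QueryError struct {
	Query string
	Err   error
}

func (e *QueryError) Error() string {
	return "query " + e.Query + ": " + e.Err.Error()
}

// Unwrap exposes the inner error to errors.Is and errors.As.
func (e *QueryError) Unwrap() error { return e.Err }

func main() {
	inner := errors.New("connection refused")
	err := &QueryError{Query: "SELECT 1", Err: inner}
	fmt.Println(err)                   // query SELECT 1: connection refused
	fmt.Println(errors.Is(err, inner)) // true: the original error is preserved
}
```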
+
+`errwrap` formalizes this pattern (it doesn't matter what approach you use
+above) by giving a single interface for wrapping errors, checking if a specific
+error is wrapped, and extracting that error.
+
+## Installation and Docs
+
+Install using `go get github.com/hashicorp/errwrap`.
+
+Full documentation is available at
+http://godoc.org/github.com/hashicorp/errwrap
+
+## Usage
+
+#### Basic Usage
+
+Below is a very basic example of its usage:
+
+```go
+// A function that always returns an error, but wraps it, like a real
+// function might.
+func tryOpen() error {
+	_, err := os.Open("/i/dont/exist")
+	if err != nil {
+		return errwrap.Wrapf("Doesn't exist: {{err}}", err)
+	}
+
+	return nil
+}
+
+func main() {
+	err := tryOpen()
+
+	// We can use the Contains helpers to check if an error contains
+	// another error. It is safe to do this with a nil error, or with
+	// an error that doesn't even use the errwrap package.
+	if errwrap.Contains(err, "does not exist") {
+		// Do something
+	}
+	if errwrap.ContainsType(err, new(os.PathError)) {
+		// Do something
+	}
+
+	// Or we can use the associated `Get` functions to just extract
+	// a specific error. This would return nil if that specific error doesn't
+	// exist.
+	perr := errwrap.GetType(err, new(os.PathError))
+}
+```
+
+#### Custom Types
+
+If you're already making custom types that properly wrap errors, then
+you can get all the functionality of `errwrap.Contains` and such by
+implementing the `Wrapper` interface with just one function. Example:
+
+```go
+type AppError struct {
+	Code ErrorCode
+	Err  error
+}
+
+func (e *AppError) WrappedErrors() []error {
+	return []error{e.Err}
+}
+```
+
+Now this works:
+
+```go
+err := &AppError{Err: fmt.Errorf("an error")}
+if errwrap.ContainsType(err, fmt.Errorf("")) {
+	// This will work!
+}
+```
diff --git a/vendor/github.com/hashicorp/errwrap/errwrap.go b/vendor/github.com/hashicorp/errwrap/errwrap.go
new file mode 100644
index 0000000000..44e368e569
--- /dev/null
+++ b/vendor/github.com/hashicorp/errwrap/errwrap.go
@@ -0,0 +1,178 @@
+// Package errwrap implements methods to formalize error wrapping in Go.
+//
+// All of the top-level functions that take an `error` are built to be able
+// to take any error, not just wrapped errors. This allows you to use errwrap
+// without having to type-check and type-cast everywhere.
+package errwrap
+
+import (
+	"errors"
+	"reflect"
+	"strings"
+)
+
+// WalkFunc is the callback called for Walk.
+type WalkFunc func(error)
+
+// Wrapper is an interface that can be implemented by custom types to
+// have all the Contains, Get, etc. functions in errwrap work.
+//
+// When Walk reaches a Wrapper, it will call the callback for every
+// wrapped error in addition to the wrapper itself. Since all the top-level
+// functions in errwrap use Walk, this means that all those functions work
+// with your custom type.
+type Wrapper interface {
+	WrappedErrors() []error
+}
+
+// Wrap defines that outer wraps inner, returning an error type that
+// can be cleanly used with the other methods in this package, such as
+// Contains, GetAll, etc.
+//
+// This function won't modify the error message at all (the outer message
+// will be used).
+func Wrap(outer, inner error) error {
+	return &wrappedError{
+		Outer: outer,
+		Inner: inner,
+	}
+}
+
+// Wrapf wraps an error with a formatting message. This is similar to using
+// `fmt.Errorf` to wrap an error.
If you're using `fmt.Errorf` to wrap +// errors, you should replace it with this. +// +// format is the format of the error message. The string '{{err}}' will +// be replaced with the original error message. +// +// Deprecated: Use fmt.Errorf() +func Wrapf(format string, err error) error { + outerMsg := "" + if err != nil { + outerMsg = err.Error() + } + + outer := errors.New(strings.Replace( + format, "{{err}}", outerMsg, -1)) + + return Wrap(outer, err) +} + +// Contains checks if the given error contains an error with the +// message msg. If err is not a wrapped error, this will always return +// false unless the error itself happens to match this msg. +func Contains(err error, msg string) bool { + return len(GetAll(err, msg)) > 0 +} + +// ContainsType checks if the given error contains an error with +// the same concrete type as v. If err is not a wrapped error, this will +// check the err itself. +func ContainsType(err error, v interface{}) bool { + return len(GetAllType(err, v)) > 0 +} + +// Get is the same as GetAll but returns the deepest matching error. +func Get(err error, msg string) error { + es := GetAll(err, msg) + if len(es) > 0 { + return es[len(es)-1] + } + + return nil +} + +// GetType is the same as GetAllType but returns the deepest matching error. +func GetType(err error, v interface{}) error { + es := GetAllType(err, v) + if len(es) > 0 { + return es[len(es)-1] + } + + return nil +} + +// GetAll gets all the errors that might be wrapped in err with the +// given message. The order of the errors is such that the outermost +// matching error (the most recent wrap) is index zero, and so on. +func GetAll(err error, msg string) []error { + var result []error + + Walk(err, func(err error) { + if err.Error() == msg { + result = append(result, err) + } + }) + + return result +} + +// GetAllType gets all the errors that are the same type as v. +// +// The order of the return value is the same as described in GetAll. +func GetAllType(err error, v interface{}) []error { + var result []error + + var search string + if v != nil { + search = reflect.TypeOf(v).String() + } + Walk(err, func(err error) { + var needle string + if err != nil { + needle = reflect.TypeOf(err).String() + } + + if needle == search { + result = append(result, err) + } + }) + + return result +} + +// Walk walks all the wrapped errors in err and calls the callback. If +// err isn't a wrapped error, this will be called once for err. If err +// is a wrapped error, the callback will be called for both the wrapper +// that implements error as well as the wrapped error itself. +func Walk(err error, cb WalkFunc) { + if err == nil { + return + } + + switch e := err.(type) { + case *wrappedError: + cb(e.Outer) + Walk(e.Inner, cb) + case Wrapper: + cb(err) + + for _, err := range e.WrappedErrors() { + Walk(err, cb) + } + case interface{ Unwrap() error }: + cb(err) + Walk(e.Unwrap(), cb) + default: + cb(err) + } +} + +// wrappedError is an implementation of error that has both the +// outer and inner errors. 
+type wrappedError struct { + Outer error + Inner error +} + +func (w *wrappedError) Error() string { + return w.Outer.Error() +} + +func (w *wrappedError) WrappedErrors() []error { + return []error{w.Outer, w.Inner} +} + +func (w *wrappedError) Unwrap() error { + return w.Inner +} diff --git a/vendor/github.com/hashicorp/go-multierror/LICENSE b/vendor/github.com/hashicorp/go-multierror/LICENSE new file mode 100644 index 0000000000..82b4de97c7 --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/LICENSE @@ -0,0 +1,353 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. “Contributor” + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. “Contributor Version” + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor’s Contribution. + +1.3. “Contribution” + + means Covered Software of a particular Contributor. + +1.4. “Covered Software” + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. “Incompatible With Secondary Licenses” + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of version + 1.1 or earlier of the License, but not also under the terms of a + Secondary License. + +1.6. “Executable Form” + + means any form of the work other than Source Code Form. + +1.7. “Larger Work” + + means a work that combines Covered Software with other material, in a separate + file or files, that is not Covered Software. + +1.8. “License” + + means this document. + +1.9. “Licensable” + + means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently, any and all of the rights conveyed by + this License. + +1.10. “Modifications” + + means any of the following: + + a. any file in Source Code Form that results from an addition to, deletion + from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. “Patent Claims” of a Contributor + + means any patent claim(s), including without limitation, method, process, + and apparatus claims, in any patent Licensable by such Contributor that + would be infringed, but for the grant of the License, by the making, + using, selling, offering for sale, having made, import, or transfer of + either its Contributions or its Contributor Version. + +1.12. “Secondary License” + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. “Source Code Form” + + means the form of the work preferred for making modifications. + +1.14. “You” (or “Your”) + + means an individual or a legal entity exercising rights under this + License. For legal entities, “You” includes any entity that controls, is + controlled by, or is under common control with You. 
For purposes of this + definition, “control” means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or as + part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its Contributions + or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution become + effective for each Contribution on the date the Contributor first distributes + such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under this + License. No additional rights or licenses will be implied from the distribution + or licensing of Covered Software under this License. Notwithstanding Section + 2.1(b) above, no patent license is granted by a Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party’s + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of its + Contributions. + + This License does not grant any rights in the trademarks, service marks, or + logos of any Contributor (except as may be necessary to comply with the + notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this License + (see Section 10.2) or under the terms of a Secondary License (if permitted + under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its Contributions + are its original creation(s) or it has sufficient rights to grant the + rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under applicable + copyright doctrines of fair use, fair dealing, or other equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under the + terms of this License. You must inform recipients that the Source Code Form + of the Covered Software is governed by the terms of this License, and how + they can obtain a copy of this License. You may not attempt to alter or + restrict the recipients’ rights in the Source Code Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. 
such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this License, + or sublicense it under different terms, provided that the license for + the Executable Form does not attempt to limit or alter the recipients’ + rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for the + Covered Software. If the Larger Work is a combination of Covered Software + with a work governed by one or more Secondary Licenses, and the Covered + Software is not Incompatible With Secondary Licenses, this License permits + You to additionally distribute such Covered Software under the terms of + such Secondary License(s), so that the recipient of the Larger Work may, at + their option, further distribute the Covered Software under the terms of + either this License or such Secondary License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices (including + copyright notices, patent notices, disclaimers of warranty, or limitations + of liability) contained within the Source Code Form of the Covered + Software, except that You may alter any license notices to the extent + required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on behalf + of any Contributor. You must make it absolutely clear that any such + warranty, support, indemnity, or liability obligation is offered by You + alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, judicial + order, or regulation then You must: (a) comply with the terms of this License + to the maximum extent possible; and (b) describe the limitations and the code + they affect. Such description must be placed in a text file included with all + distributions of the Covered Software under this License. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. 
However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing basis, + if such Contributor fails to notify You of the non-compliance by some + reasonable means prior to 60 days after You have come back into compliance. + Moreover, Your grants from a particular Contributor are reinstated on an + ongoing basis if such Contributor notifies You of the non-compliance by + some reasonable means, this is the first time You have received notice of + non-compliance with this License from such Contributor, and You become + compliant prior to 30 days after Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, counter-claims, + and cross-claims) alleging that a Contributor Version directly or + indirectly infringes any patent, then the rights granted to You by any and + all Contributors for the Covered Software under Section 2.1 of this License + shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an “as is” basis, without + warranty of any kind, either expressed, implied, or statutory, including, + without limitation, warranties that the Covered Software is free of defects, + merchantable, fit for a particular purpose or non-infringing. The entire + risk as to the quality and performance of the Covered Software is with You. + Should any Covered Software prove defective in any respect, You (not any + Contributor) assume the cost of any necessary servicing, repair, or + correction. This disclaimer of warranty constitutes an essential part of this + License. No use of any Covered Software is authorized under this License + except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from such + party’s negligence to the extent applicable law prohibits such limitation. + Some jurisdictions do not allow the exclusion or limitation of incidental or + consequential damages, so this exclusion and limitation may not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts of + a jurisdiction where the defendant maintains its principal place of business + and such litigation shall be governed by laws of that jurisdiction, without + reference to its conflict-of-law provisions. Nothing in this Section shall + prevent a party’s ability to bring cross-claims or counter-claims. + +9. 
Miscellaneous + + This License represents the complete agreement concerning the subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. Any law or regulation which provides that the language of a + contract shall be construed against the drafter shall not be used to construe + this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version of + the License under which You originally received the Covered Software, or + under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a modified + version of this License if you rename the license and remove any + references to the name of the license steward (except to note that such + modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses + If You choose to distribute Source Code Form that is Incompatible With + Secondary Licenses under the terms of this version of the License, the + notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, then +You may include the notice in a location (such as a LICENSE file in a relevant +directory) where a recipient would be likely to look for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - “Incompatible With Secondary Licenses” Notice + + This Source Code Form is “Incompatible + With Secondary Licenses”, as defined by + the Mozilla Public License, v. 2.0. diff --git a/vendor/github.com/hashicorp/go-multierror/Makefile b/vendor/github.com/hashicorp/go-multierror/Makefile new file mode 100644 index 0000000000..b97cd6ed02 --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/Makefile @@ -0,0 +1,31 @@ +TEST?=./... + +default: test + +# test runs the test suite and vets the code. +test: generate + @echo "==> Running tests..." + @go list $(TEST) \ + | grep -v "/vendor/" \ + | xargs -n1 go test -timeout=60s -parallel=10 ${TESTARGS} + +# testrace runs the race checker +testrace: generate + @echo "==> Running tests (race)..." + @go list $(TEST) \ + | grep -v "/vendor/" \ + | xargs -n1 go test -timeout=60s -race ${TESTARGS} + +# updatedeps installs all the dependencies needed to run and build. +updatedeps: + @sh -c "'${CURDIR}/scripts/deps.sh' '${NAME}'" + +# generate runs `go generate` to build the dynamically generated source files. +generate: + @echo "==> Generating..." + @find . -type f -name '.DS_Store' -delete + @go list ./... 
\ + | grep -v "/vendor/" \ + | xargs -n1 go generate + +.PHONY: default test testrace updatedeps generate diff --git a/vendor/github.com/hashicorp/go-multierror/README.md b/vendor/github.com/hashicorp/go-multierror/README.md new file mode 100644 index 0000000000..71dd308ed8 --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/README.md @@ -0,0 +1,150 @@ +# go-multierror + +[![CircleCI](https://img.shields.io/circleci/build/github/hashicorp/go-multierror/master)](https://circleci.com/gh/hashicorp/go-multierror) +[![Go Reference](https://pkg.go.dev/badge/github.com/hashicorp/go-multierror.svg)](https://pkg.go.dev/github.com/hashicorp/go-multierror) +![GitHub go.mod Go version](https://img.shields.io/github/go-mod/go-version/hashicorp/go-multierror) + +[circleci]: https://app.circleci.com/pipelines/github/hashicorp/go-multierror +[godocs]: https://pkg.go.dev/github.com/hashicorp/go-multierror + +`go-multierror` is a package for Go that provides a mechanism for +representing a list of `error` values as a single `error`. + +This allows a function in Go to return an `error` that might actually +be a list of errors. If the caller knows this, they can unwrap the +list and access the errors. If the caller doesn't know, the error +formats to a nice human-readable format. + +`go-multierror` is fully compatible with the Go standard library +[errors](https://golang.org/pkg/errors/) package, including the +functions `As`, `Is`, and `Unwrap`. This provides a standardized approach +for introspecting on error values. + +## Installation and Docs + +Install using `go get github.com/hashicorp/go-multierror`. + +Full documentation is available at +https://pkg.go.dev/github.com/hashicorp/go-multierror + +### Requires go version 1.13 or newer + +`go-multierror` requires go version 1.13 or newer. Go 1.13 introduced +[error wrapping](https://golang.org/doc/go1.13#error_wrapping), which +this library takes advantage of. + +If you need to use an earlier version of go, you can use the +[v1.0.0](https://github.com/hashicorp/go-multierror/tree/v1.0.0) +tag, which doesn't rely on features in go 1.13. + +If you see compile errors that look like the below, it's likely that +you're on an older version of go: + +``` +/go/src/github.com/hashicorp/go-multierror/multierror.go:112:9: undefined: errors.As +/go/src/github.com/hashicorp/go-multierror/multierror.go:117:9: undefined: errors.Is +``` + +## Usage + +go-multierror is easy to use and purposely built to be unobtrusive in +existing Go applications/libraries that may not be aware of it. + +**Building a list of errors** + +The `Append` function is used to create a list of errors. This function +behaves a lot like the Go built-in `append` function: it doesn't matter +if the first argument is nil, a `multierror.Error`, or any other `error`, +the function behaves as you would expect. + +```go +var result error + +if err := step1(); err != nil { + result = multierror.Append(result, err) +} +if err := step2(); err != nil { + result = multierror.Append(result, err) +} + +return result +``` + +**Customizing the formatting of the errors** + +By specifying a custom `ErrorFormat`, you can customize the format +of the `Error() string` function: + +```go +var result *multierror.Error + +// ... accumulate errors here, maybe using Append + +if result != nil { + result.ErrorFormat = func([]error) string { + return "errors!" 
+ } +} +``` + +**Accessing the list of errors** + +`multierror.Error` implements `error` so if the caller doesn't know about +multierror, it will work just fine. But if you're aware a multierror might +be returned, you can use type switches to access the list of errors: + +```go +if err := something(); err != nil { + if merr, ok := err.(*multierror.Error); ok { + // Use merr.Errors + } +} +``` + +You can also use the standard [`errors.Unwrap`](https://golang.org/pkg/errors/#Unwrap) +function. This will continue to unwrap into subsequent errors until none exist. + +**Extracting an error** + +The standard library [`errors.As`](https://golang.org/pkg/errors/#As) +function can be used directly with a multierror to extract a specific error: + +```go +// Assume err is a multierror value +err := somefunc() + +// We want to know if "err" has a "RichErrorType" in it and extract it. +var errRich RichErrorType +if errors.As(err, &errRich) { + // It has it, and now errRich is populated. +} +``` + +**Checking for an exact error value** + +Some errors are returned as exact errors such as the [`ErrNotExist`](https://golang.org/pkg/os/#pkg-variables) +error in the `os` package. You can check if this error is present by using +the standard [`errors.Is`](https://golang.org/pkg/errors/#Is) function. + +```go +// Assume err is a multierror value +err := somefunc() +if errors.Is(err, os.ErrNotExist) { + // err contains os.ErrNotExist +} +``` + +**Returning a multierror only if there are errors** + +If you build a `multierror.Error`, you can use the `ErrorOrNil` function +to return an `error` implementation only if there are errors to return: + +```go +var result *multierror.Error + +// ... accumulate errors here + +// Return the `error` only if errors were added to the multierror, otherwise +// return nil since there are no errors. +return result.ErrorOrNil() +``` diff --git a/vendor/github.com/hashicorp/go-multierror/append.go b/vendor/github.com/hashicorp/go-multierror/append.go new file mode 100644 index 0000000000..3e2589bfde --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/append.go @@ -0,0 +1,43 @@ +package multierror + +// Append is a helper function that will append more errors +// onto an Error in order to create a larger multi-error. +// +// If err is not a multierror.Error, then it will be turned into +// one. If any of the errs are multierr.Error, they will be flattened +// one level into err. +// Any nil errors within errs will be ignored. If err is nil, a new +// *Error will be returned. +func Append(err error, errs ...error) *Error { + switch err := err.(type) { + case *Error: + // Typed nils can reach here, so initialize if we are nil + if err == nil { + err = new(Error) + } + + // Go through each error and flatten + for _, e := range errs { + switch e := e.(type) { + case *Error: + if e != nil { + err.Errors = append(err.Errors, e.Errors...) + } + default: + if e != nil { + err.Errors = append(err.Errors, e) + } + } + } + + return err + default: + newErrs := make([]error, 0, len(errs)+1) + if err != nil { + newErrs = append(newErrs, err) + } + newErrs = append(newErrs, errs...) + + return Append(&Error{}, newErrs...) 
+ } +} diff --git a/vendor/github.com/hashicorp/go-multierror/flatten.go b/vendor/github.com/hashicorp/go-multierror/flatten.go new file mode 100644 index 0000000000..aab8e9abec --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/flatten.go @@ -0,0 +1,26 @@ +package multierror + +// Flatten flattens the given error, merging any *Errors together into +// a single *Error. +func Flatten(err error) error { + // If it isn't an *Error, just return the error as-is + if _, ok := err.(*Error); !ok { + return err + } + + // Otherwise, make the result and flatten away! + flatErr := new(Error) + flatten(err, flatErr) + return flatErr +} + +func flatten(err error, flatErr *Error) { + switch err := err.(type) { + case *Error: + for _, e := range err.Errors { + flatten(e, flatErr) + } + default: + flatErr.Errors = append(flatErr.Errors, err) + } +} diff --git a/vendor/github.com/hashicorp/go-multierror/format.go b/vendor/github.com/hashicorp/go-multierror/format.go new file mode 100644 index 0000000000..47f13c49a6 --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/format.go @@ -0,0 +1,27 @@ +package multierror + +import ( + "fmt" + "strings" +) + +// ErrorFormatFunc is a function callback that is called by Error to +// turn the list of errors into a string. +type ErrorFormatFunc func([]error) string + +// ListFormatFunc is a basic formatter that outputs the number of errors +// that occurred along with a bullet point list of the errors. +func ListFormatFunc(es []error) string { + if len(es) == 1 { + return fmt.Sprintf("1 error occurred:\n\t* %s\n\n", es[0]) + } + + points := make([]string, len(es)) + for i, err := range es { + points[i] = fmt.Sprintf("* %s", err) + } + + return fmt.Sprintf( + "%d errors occurred:\n\t%s\n\n", + len(es), strings.Join(points, "\n\t")) +} diff --git a/vendor/github.com/hashicorp/go-multierror/group.go b/vendor/github.com/hashicorp/go-multierror/group.go new file mode 100644 index 0000000000..9c29efb7f8 --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/group.go @@ -0,0 +1,38 @@ +package multierror + +import "sync" + +// Group is a collection of goroutines which return errors that need to be +// coalesced. +type Group struct { + mutex sync.Mutex + err *Error + wg sync.WaitGroup +} + +// Go calls the given function in a new goroutine. +// +// If the function returns an error it is added to the group multierror which +// is returned by Wait. +func (g *Group) Go(f func() error) { + g.wg.Add(1) + + go func() { + defer g.wg.Done() + + if err := f(); err != nil { + g.mutex.Lock() + g.err = Append(g.err, err) + g.mutex.Unlock() + } + }() +} + +// Wait blocks until all function calls from the Go method have returned, then +// returns the multierror. +func (g *Group) Wait() *Error { + g.wg.Wait() + g.mutex.Lock() + defer g.mutex.Unlock() + return g.err +} diff --git a/vendor/github.com/hashicorp/go-multierror/multierror.go b/vendor/github.com/hashicorp/go-multierror/multierror.go new file mode 100644 index 0000000000..f545743264 --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/multierror.go @@ -0,0 +1,121 @@ +package multierror + +import ( + "errors" + "fmt" +) + +// Error is an error type to track multiple errors. This is used to +// accumulate errors in cases and return them as a single "error". 
+type Error struct { + Errors []error + ErrorFormat ErrorFormatFunc +} + +func (e *Error) Error() string { + fn := e.ErrorFormat + if fn == nil { + fn = ListFormatFunc + } + + return fn(e.Errors) +} + +// ErrorOrNil returns an error interface if this Error represents +// a list of errors, or returns nil if the list of errors is empty. This +// function is useful at the end of accumulation to make sure that the value +// returned represents the existence of errors. +func (e *Error) ErrorOrNil() error { + if e == nil { + return nil + } + if len(e.Errors) == 0 { + return nil + } + + return e +} + +func (e *Error) GoString() string { + return fmt.Sprintf("*%#v", *e) +} + +// WrappedErrors returns the list of errors that this Error is wrapping. It is +// an implementation of the errwrap.Wrapper interface so that multierror.Error +// can be used with that library. +// +// This method is not safe to be called concurrently. Unlike accessing the +// Errors field directly, this function also checks if the multierror is nil to +// prevent a null-pointer panic. It satisfies the errwrap.Wrapper interface. +func (e *Error) WrappedErrors() []error { + if e == nil { + return nil + } + return e.Errors +} + +// Unwrap returns an error from Error (or nil if there are no errors). +// This error returned will further support Unwrap to get the next error, +// etc. The order will match the order of Errors in the multierror.Error +// at the time of calling. +// +// The resulting error supports errors.As/Is/Unwrap so you can continue +// to use the stdlib errors package to introspect further. +// +// This will perform a shallow copy of the errors slice. Any errors appended +// to this error after calling Unwrap will not be available until a new +// Unwrap is called on the multierror.Error. +func (e *Error) Unwrap() error { + // If we have no errors then we do nothing + if e == nil || len(e.Errors) == 0 { + return nil + } + + // If we have exactly one error, we can just return that directly. + if len(e.Errors) == 1 { + return e.Errors[0] + } + + // Shallow copy the slice + errs := make([]error, len(e.Errors)) + copy(errs, e.Errors) + return chain(errs) +} + +// chain implements the interfaces necessary for errors.Is/As/Unwrap to +// work in a deterministic way with multierror. A chain tracks a list of +// errors while accounting for the current represented error. This lets +// Is/As be meaningful. +// +// Unwrap returns the next error. In the cleanest form, Unwrap would return +// the wrapped error here but we can't do that if we want to properly +// get access to all the errors. Instead, users are recommended to use +// Is/As to get the correct error type out. +// +// Precondition: []error is non-empty (len > 0) +type chain []error + +// Error implements the error interface +func (e chain) Error() string { + return e[0].Error() +} + +// Unwrap implements errors.Unwrap by returning the next error in the +// chain or nil if there are no more errors. +func (e chain) Unwrap() error { + if len(e) == 1 { + return nil + } + + return e[1:] +} + +// As implements errors.As by attempting to map to the current value. +func (e chain) As(target interface{}) bool { + return errors.As(e[0], target) +} + +// Is implements errors.Is by comparing the current value directly. 
+func (e chain) Is(target error) bool { + return errors.Is(e[0], target) +} diff --git a/vendor/github.com/hashicorp/go-multierror/prefix.go b/vendor/github.com/hashicorp/go-multierror/prefix.go new file mode 100644 index 0000000000..5c477abe44 --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/prefix.go @@ -0,0 +1,37 @@ +package multierror + +import ( + "fmt" + + "github.com/hashicorp/errwrap" +) + +// Prefix is a helper function that will prefix some text +// to the given error. If the error is a multierror.Error, then +// it will be prefixed to each wrapped error. +// +// This is useful to use when appending multiple multierrors +// together in order to give better scoping. +func Prefix(err error, prefix string) error { + if err == nil { + return nil + } + + format := fmt.Sprintf("%s {{err}}", prefix) + switch err := err.(type) { + case *Error: + // Typed nils can reach here, so initialize if we are nil + if err == nil { + err = new(Error) + } + + // Wrap each of the errors + for i, e := range err.Errors { + err.Errors[i] = errwrap.Wrapf(format, e) + } + + return err + default: + return errwrap.Wrapf(format, err) + } +} diff --git a/vendor/github.com/hashicorp/go-multierror/sort.go b/vendor/github.com/hashicorp/go-multierror/sort.go new file mode 100644 index 0000000000..fecb14e81c --- /dev/null +++ b/vendor/github.com/hashicorp/go-multierror/sort.go @@ -0,0 +1,16 @@ +package multierror + +// Len implements sort.Interface function for length +func (err Error) Len() int { + return len(err.Errors) +} + +// Swap implements sort.Interface function for swapping elements +func (err Error) Swap(i, j int) { + err.Errors[i], err.Errors[j] = err.Errors[j], err.Errors[i] +} + +// Less implements sort.Interface function for determining order +func (err Error) Less(i, j int) bool { + return err.Errors[i].Error() < err.Errors[j].Error() +} diff --git a/vendor/github.com/klauspost/compress/.gitattributes b/vendor/github.com/klauspost/compress/.gitattributes new file mode 100644 index 0000000000..402433593c --- /dev/null +++ b/vendor/github.com/klauspost/compress/.gitattributes @@ -0,0 +1,2 @@ +* -text +*.bin -text -diff diff --git a/vendor/github.com/klauspost/compress/.gitignore b/vendor/github.com/klauspost/compress/.gitignore new file mode 100644 index 0000000000..d31b378152 --- /dev/null +++ b/vendor/github.com/klauspost/compress/.gitignore @@ -0,0 +1,32 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test +*.prof +/s2/cmd/_s2sx/sfx-exe + +# Linux perf files +perf.data +perf.data.old + +# gdb history +.gdb_history diff --git a/vendor/github.com/klauspost/compress/.goreleaser.yml b/vendor/github.com/klauspost/compress/.goreleaser.yml new file mode 100644 index 0000000000..4528059ca6 --- /dev/null +++ b/vendor/github.com/klauspost/compress/.goreleaser.yml @@ -0,0 +1,123 @@ +version: 2 + +before: + hooks: + - ./gen.sh + +builds: + - + id: "s2c" + binary: s2c + main: ./s2/cmd/s2c/main.go + flags: + - -trimpath + env: + - CGO_ENABLED=0 + goos: + - aix + - linux + - freebsd + - netbsd + - windows + - darwin + goarch: + - 386 + - amd64 + - arm + - arm64 + - ppc64 + - ppc64le + - mips64 + - mips64le + goarm: + - 7 + - + id: 
"s2d" + binary: s2d + main: ./s2/cmd/s2d/main.go + flags: + - -trimpath + env: + - CGO_ENABLED=0 + goos: + - aix + - linux + - freebsd + - netbsd + - windows + - darwin + goarch: + - 386 + - amd64 + - arm + - arm64 + - ppc64 + - ppc64le + - mips64 + - mips64le + goarm: + - 7 + - + id: "s2sx" + binary: s2sx + main: ./s2/cmd/_s2sx/main.go + flags: + - -modfile=s2sx.mod + - -trimpath + env: + - CGO_ENABLED=0 + goos: + - aix + - linux + - freebsd + - netbsd + - windows + - darwin + goarch: + - 386 + - amd64 + - arm + - arm64 + - ppc64 + - ppc64le + - mips64 + - mips64le + goarm: + - 7 + +archives: + - + id: s2-binaries + name_template: "s2-{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}" + format_overrides: + - goos: windows + format: zip + files: + - unpack/* + - s2/LICENSE + - s2/README.md +checksum: + name_template: 'checksums.txt' +snapshot: + version_template: "{{ .Tag }}-next" +changelog: + sort: asc + filters: + exclude: + - '^doc:' + - '^docs:' + - '^test:' + - '^tests:' + - '^Update\sREADME.md' + +nfpms: + - + file_name_template: "s2_package__{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}" + vendor: Klaus Post + homepage: https://github.com/klauspost/compress + maintainer: Klaus Post + description: S2 Compression Tool + license: BSD 3-Clause + formats: + - deb + - rpm diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE index 1eb75ef68e..87d5574777 100644 --- a/vendor/github.com/klauspost/compress/LICENSE +++ b/vendor/github.com/klauspost/compress/LICENSE @@ -26,3 +26,279 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------ + +Files: gzhttp/* + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2016-2017 The New York Times Company + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------------------ + +Files: s2/cmd/internal/readahead/* + +The MIT License (MIT) + +Copyright (c) 2015 Klaus Post + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +--------------------- +Files: snappy/* +Files: internal/snapref/* + +Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +----------------- + +Files: s2/cmd/internal/filepathx/* + +Copyright 2016 The filepathx Authors + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md new file mode 100644 index 0000000000..de264c85a5 --- /dev/null +++ b/vendor/github.com/klauspost/compress/README.md @@ -0,0 +1,721 @@ +# compress + +This package provides various compression algorithms. + +* [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression in pure Go. 
+* [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) is a high performance replacement for Snappy. +* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib). +* [snappy](https://github.com/klauspost/compress/tree/master/snappy) is a drop-in replacement for `github.com/golang/snappy` offering better compression and concurrent streams. +* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding. +* [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp) Provides client and server wrappers for handling gzipped requests efficiently. +* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation. + +[![Go Reference](https://pkg.go.dev/badge/klauspost/compress.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories) +[![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml) +[![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge) + +# changelog + +* Sep 23rd, 2024 - [1.17.10](https://github.com/klauspost/compress/releases/tag/v1.17.10) + * gzhttp: Add TransportAlwaysDecompress option. 
https://github.com/klauspost/compress/pull/978 + * gzhttp: Add supported decompress request body by @mirecl in https://github.com/klauspost/compress/pull/1002 + * s2: Add EncodeBuffer buffer recycling callback https://github.com/klauspost/compress/pull/982 + * zstd: Improve memory usage on small streaming encodes https://github.com/klauspost/compress/pull/1007 + * flate: read data written with partial flush by @vajexal in https://github.com/klauspost/compress/pull/996 + +* Jun 12th, 2024 - [1.17.9](https://github.com/klauspost/compress/releases/tag/v1.17.9) + * s2: Reduce ReadFrom temporary allocations https://github.com/klauspost/compress/pull/949 + * flate, zstd: Shave some bytes off amd64 matchLen by @greatroar in https://github.com/klauspost/compress/pull/963 + * Upgrade zip/zlib to 1.22.4 upstream https://github.com/klauspost/compress/pull/970 https://github.com/klauspost/compress/pull/971 + * zstd: BuildDict fails with RLE table https://github.com/klauspost/compress/pull/951 + +* Apr 9th, 2024 - [1.17.8](https://github.com/klauspost/compress/releases/tag/v1.17.8) + * zstd: Reject blocks where reserved values are not 0 https://github.com/klauspost/compress/pull/885 + * zstd: Add RLE detection+encoding https://github.com/klauspost/compress/pull/938 + +* Feb 21st, 2024 - [1.17.7](https://github.com/klauspost/compress/releases/tag/v1.17.7) + * s2: Add AsyncFlush method: Complete the block without flushing by @Jille in https://github.com/klauspost/compress/pull/927 + * s2: Fix literal+repeat exceeds dst crash https://github.com/klauspost/compress/pull/930 + +* Feb 5th, 2024 - [1.17.6](https://github.com/klauspost/compress/releases/tag/v1.17.6) + * zstd: Fix incorrect repeat coding in best mode https://github.com/klauspost/compress/pull/923 + * s2: Fix DecodeConcurrent deadlock on errors https://github.com/klauspost/compress/pull/925 + +* Jan 26th, 2024 - [v1.17.5](https://github.com/klauspost/compress/releases/tag/v1.17.5) + * flate: Fix reset with dictionary on custom window encodes https://github.com/klauspost/compress/pull/912 + * zstd: Add Frame header encoding and stripping https://github.com/klauspost/compress/pull/908 + * zstd: Limit better/best default window to 8MB https://github.com/klauspost/compress/pull/913 + * zstd: Speed improvements by @greatroar in https://github.com/klauspost/compress/pull/896 https://github.com/klauspost/compress/pull/910 + * s2: Fix callbacks for skippable blocks and disallow 0xfe (Padding) by @Jille in https://github.com/klauspost/compress/pull/916 https://github.com/klauspost/compress/pull/917 +https://github.com/klauspost/compress/pull/919 https://github.com/klauspost/compress/pull/918 + +* Dec 1st, 2023 - [v1.17.4](https://github.com/klauspost/compress/releases/tag/v1.17.4) + * huff0: Speed up symbol counting by @greatroar in https://github.com/klauspost/compress/pull/887 + * huff0: Remove byteReader by @greatroar in https://github.com/klauspost/compress/pull/886 + * gzhttp: Allow overriding decompression on transport https://github.com/klauspost/compress/pull/892 + * gzhttp: Clamp compression level https://github.com/klauspost/compress/pull/890 + * gzip: Error out if reserved bits are set 
https://github.com/klauspost/compress/pull/891 + +* Nov 15th, 2023 - [v1.17.3](https://github.com/klauspost/compress/releases/tag/v1.17.3) + * fse: Fix max header size https://github.com/klauspost/compress/pull/881 + * zstd: Improve better/best compression https://github.com/klauspost/compress/pull/877 + * gzhttp: Fix missing content type on Close https://github.com/klauspost/compress/pull/883 + +* Oct 22nd, 2023 - [v1.17.2](https://github.com/klauspost/compress/releases/tag/v1.17.2) + * zstd: Fix rare *CORRUPTION* output in "best" mode. See https://github.com/klauspost/compress/pull/876 + +* Oct 14th, 2023 - [v1.17.1](https://github.com/klauspost/compress/releases/tag/v1.17.1) + * s2: Fix S2 "best" dictionary wrong encoding by @klauspost in https://github.com/klauspost/compress/pull/871 + * flate: Reduce allocations in decompressor and minor code improvements by @fakefloordiv in https://github.com/klauspost/compress/pull/869 + * s2: Fix EstimateBlockSize on 6&7 length input by @klauspost in https://github.com/klauspost/compress/pull/867 + +* Sept 19th, 2023 - [v1.17.0](https://github.com/klauspost/compress/releases/tag/v1.17.0) + * Add experimental dictionary builder https://github.com/klauspost/compress/pull/853 + * Add xerial snappy read/writer https://github.com/klauspost/compress/pull/838 + * flate: Add limited window compression https://github.com/klauspost/compress/pull/843 + * s2: Do 2 overlapping match checks https://github.com/klauspost/compress/pull/839 + * flate: Add amd64 assembly matchlen https://github.com/klauspost/compress/pull/837 + * gzip: Copy bufio.Reader on Reset by @thatguystone in https://github.com/klauspost/compress/pull/860 + +
+ See changes to v1.16.x + + +* July 1st, 2023 - [v1.16.7](https://github.com/klauspost/compress/releases/tag/v1.16.7) + * zstd: Fix default level first dictionary encode https://github.com/klauspost/compress/pull/829 + * s2: add GetBufferCapacity() method by @GiedriusS in https://github.com/klauspost/compress/pull/832 + +* June 13, 2023 - [v1.16.6](https://github.com/klauspost/compress/releases/tag/v1.16.6) + * zstd: correctly ignore WithEncoderPadding(1) by @ianlancetaylor in https://github.com/klauspost/compress/pull/806 + * zstd: Add amd64 match length assembly https://github.com/klauspost/compress/pull/824 + * gzhttp: Handle informational headers by @rtribotte in https://github.com/klauspost/compress/pull/815 + * s2: Improve Better compression slightly https://github.com/klauspost/compress/pull/663 + +* Apr 16, 2023 - [v1.16.5](https://github.com/klauspost/compress/releases/tag/v1.16.5) + * zstd: readByte needs to use io.ReadFull by @jnoxon in https://github.com/klauspost/compress/pull/802 + * gzip: Fix WriterTo after initial read https://github.com/klauspost/compress/pull/804 + +* Apr 5, 2023 - [v1.16.4](https://github.com/klauspost/compress/releases/tag/v1.16.4) + * zstd: Improve zstd best efficiency by @greatroar and @klauspost in https://github.com/klauspost/compress/pull/784 + * zstd: Respect WithAllLitEntropyCompression https://github.com/klauspost/compress/pull/792 + * zstd: Fix amd64 not always detecting corrupt data https://github.com/klauspost/compress/pull/785 + * zstd: Various minor improvements by @greatroar in https://github.com/klauspost/compress/pull/788 https://github.com/klauspost/compress/pull/794 https://github.com/klauspost/compress/pull/795 + * s2: Fix huge block overflow https://github.com/klauspost/compress/pull/779 + * s2: Allow CustomEncoder fallback https://github.com/klauspost/compress/pull/780 + * gzhttp: Support ResponseWriter Unwrap() in gzhttp handler by @jgimenez in https://github.com/klauspost/compress/pull/799 + +* Mar 13, 2023 - [v1.16.1](https://github.com/klauspost/compress/releases/tag/v1.16.1) + * zstd: Speed up + improve best encoder by @greatroar in https://github.com/klauspost/compress/pull/776 + * gzhttp: Add optional [BREACH mitigation](https://github.com/klauspost/compress/tree/master/gzhttp#breach-mitigation). https://github.com/klauspost/compress/pull/762 https://github.com/klauspost/compress/pull/768 https://github.com/klauspost/compress/pull/769 https://github.com/klauspost/compress/pull/770 https://github.com/klauspost/compress/pull/767 + * s2: Add Intel LZ4s converter https://github.com/klauspost/compress/pull/766 + * zstd: Minor bug fixes https://github.com/klauspost/compress/pull/771 https://github.com/klauspost/compress/pull/772 https://github.com/klauspost/compress/pull/773 + * huff0: Speed up compress1xDo by @greatroar in https://github.com/klauspost/compress/pull/774 + +* Feb 26, 2023 - [v1.16.0](https://github.com/klauspost/compress/releases/tag/v1.16.0) + * s2: Add [Dictionary](https://github.com/klauspost/compress/tree/master/s2#dictionaries) support. https://github.com/klauspost/compress/pull/685 + * s2: Add Compression Size Estimate. 
https://github.com/klauspost/compress/pull/752 + * s2: Add support for custom stream encoder. https://github.com/klauspost/compress/pull/755 + * s2: Add LZ4 block converter. https://github.com/klauspost/compress/pull/748 + * s2: Support io.ReaderAt in ReadSeeker. https://github.com/klauspost/compress/pull/747 + * s2c/s2sx: Use concurrent decoding. https://github.com/klauspost/compress/pull/746 +
+ +
+ See changes to v1.15.x + +* Jan 21st, 2023 (v1.15.15) + * deflate: Improve level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/739 + * zstd: Add delta encoding support by @greatroar in https://github.com/klauspost/compress/pull/728 + * zstd: Various speed improvements by @greatroar https://github.com/klauspost/compress/pull/741 https://github.com/klauspost/compress/pull/734 https://github.com/klauspost/compress/pull/736 https://github.com/klauspost/compress/pull/744 https://github.com/klauspost/compress/pull/743 https://github.com/klauspost/compress/pull/745 + * gzhttp: Add SuffixETag() and DropETag() options to prevent ETag collisions on compressed responses by @willbicks in https://github.com/klauspost/compress/pull/740 + +* Jan 3rd, 2023 (v1.15.14) + + * flate: Improve speed in big stateless blocks https://github.com/klauspost/compress/pull/718 + * zstd: Minor speed tweaks by @greatroar in https://github.com/klauspost/compress/pull/716 https://github.com/klauspost/compress/pull/720 + * export NoGzipResponseWriter for custom ResponseWriter wrappers by @harshavardhana in https://github.com/klauspost/compress/pull/722 + * s2: Add example for indexing and existing stream https://github.com/klauspost/compress/pull/723 + +* Dec 11, 2022 (v1.15.13) + * zstd: Add [MaxEncodedSize](https://pkg.go.dev/github.com/klauspost/compress@v1.15.13/zstd#Encoder.MaxEncodedSize) to encoder https://github.com/klauspost/compress/pull/691 + * zstd: Various tweaks and improvements https://github.com/klauspost/compress/pull/693 https://github.com/klauspost/compress/pull/695 https://github.com/klauspost/compress/pull/696 https://github.com/klauspost/compress/pull/701 https://github.com/klauspost/compress/pull/702 https://github.com/klauspost/compress/pull/703 https://github.com/klauspost/compress/pull/704 https://github.com/klauspost/compress/pull/705 https://github.com/klauspost/compress/pull/706 https://github.com/klauspost/compress/pull/707 https://github.com/klauspost/compress/pull/708 + +* Oct 26, 2022 (v1.15.12) + + * zstd: Tweak decoder allocs. 
https://github.com/klauspost/compress/pull/680 + * gzhttp: Always delete `HeaderNoCompression` https://github.com/klauspost/compress/pull/683 + +* Sept 26, 2022 (v1.15.11) + + * flate: Improve level 1-3 compression https://github.com/klauspost/compress/pull/678 + * zstd: Improve "best" compression by @nightwolfz in https://github.com/klauspost/compress/pull/677 + * zstd: Fix+reduce decompression allocations https://github.com/klauspost/compress/pull/668 + * zstd: Fix non-effective noescape tag https://github.com/klauspost/compress/pull/667 + +* Sept 16, 2022 (v1.15.10) + + * zstd: Add [WithDecodeAllCapLimit](https://pkg.go.dev/github.com/klauspost/compress@v1.15.10/zstd#WithDecodeAllCapLimit) https://github.com/klauspost/compress/pull/649 + * Add Go 1.19 - deprecate Go 1.16 https://github.com/klauspost/compress/pull/651 + * flate: Improve level 5+6 compression https://github.com/klauspost/compress/pull/656 + * zstd: Improve "better" compression https://github.com/klauspost/compress/pull/657 + * s2: Improve "best" compression https://github.com/klauspost/compress/pull/658 + * s2: Improve "better" compression. https://github.com/klauspost/compress/pull/635 + * s2: Slightly faster non-assembly decompression https://github.com/klauspost/compress/pull/646 + * Use arrays for constant size copies https://github.com/klauspost/compress/pull/659 + +* July 21, 2022 (v1.15.9) + + * zstd: Fix decoder crash on amd64 (no BMI) on invalid input https://github.com/klauspost/compress/pull/645 + * zstd: Disable decoder extended memory copies (amd64) due to possible crashes https://github.com/klauspost/compress/pull/644 + * zstd: Allow single segments up to "max decoded size" by @klauspost in https://github.com/klauspost/compress/pull/643 + +* July 13, 2022 (v1.15.8) + + * gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641 + * s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638 + * zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636 + * zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637 + * huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634 + * zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640 + * gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639 + +* June 29, 2022 (v1.15.7) + + * s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633 + * zip: Merge upstream https://github.com/klauspost/compress/pull/631 + * zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624 + * zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598 + * flate: Faster histograms https://github.com/klauspost/compress/pull/620 + * deflate: Use compound hcode https://github.com/klauspost/compress/pull/622 + +* June 3, 2022 (v1.15.6) + * s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613 + * s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611 + * zstd: Always use configured block size 
https://github.com/klauspost/compress/pull/605 + * zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606 + * zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608 + * gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612 + * s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609 + * s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607 + * snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614 + * s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610 + +* May 25, 2022 (v1.15.5) + * s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602 + * s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601 + * huff0: asm implementation of Decompress1X by @WojciechMula https://github.com/klauspost/compress/pull/596 + * zstd: Use 1 less goroutine for stream decoding https://github.com/klauspost/compress/pull/588 + * zstd: Copy literal in 16 byte blocks when possible https://github.com/klauspost/compress/pull/592 + * zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599 + * zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593 + * huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586 + * flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590 + + +* May 11, 2022 (v1.15.4) + * huff0: decompress directly into output by @WojciechMula in [#577](https://github.com/klauspost/compress/pull/577) + * inflate: Keep dict on stack [#581](https://github.com/klauspost/compress/pull/581) + * zstd: Faster decoding memcopy in asm [#583](https://github.com/klauspost/compress/pull/583) + * zstd: Fix ignored crc [#580](https://github.com/klauspost/compress/pull/580) + +* May 5, 2022 (v1.15.3) + * zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572) + * s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575) + +* Apr 26, 2022 (v1.15.2) + * zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster. [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537) + * zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539) + * s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555) + * Minimum version is Go 1.16, added CI test on 1.18. 
+
+* Mar 11, 2022 (v1.15.1)
+  * huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
+  * zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
+  * zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
+  * zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
+  * zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
+
+* Mar 3, 2022 (v1.15.0)
+  * zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
+  * zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
+  * huff0: Prevent single blocks exceeding 16 bits by @klauspost in [#507](https://github.com/klauspost/compress/pull/507)
+  * flate: Inline literal emission by @klauspost in [#509](https://github.com/klauspost/compress/pull/509)
+  * gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400)
+  * gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510)
+
+Both compression and decompression now support "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.
+
+Stream decompression is now faster in asynchronous mode, since the goroutine allocation splits the workload much more effectively. Typical streams will fully use 2 cores for decompression. When a stream has finished decoding, no goroutines are left over, so decoders can now safely be pooled and still be garbage collected.
+
+While the release has been extensively tested, it is recommended to test when upgrading.
+
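+As a minimal sketch of the synchronous mode described above: `WithEncoderConcurrency` and `WithDecoderConcurrency` are the package's real options, while the `roundTrip` helper is illustrative and not part of the API.
+
+```go
+package main
+
+import (
+	"bytes"
+	"io"
+	"os"
+
+	"github.com/klauspost/compress/zstd"
+)
+
+// roundTrip compresses and then decompresses src with concurrency set
+// to 1, so neither the encoder nor the decoder spawns goroutines.
+func roundTrip(src []byte) ([]byte, error) {
+	var buf bytes.Buffer
+
+	enc, err := zstd.NewWriter(&buf, zstd.WithEncoderConcurrency(1))
+	if err != nil {
+		return nil, err
+	}
+	if _, err := enc.Write(src); err != nil {
+		enc.Close()
+		return nil, err
+	}
+	// Close flushes the final frame.
+	if err := enc.Close(); err != nil {
+		return nil, err
+	}
+
+	dec, err := zstd.NewReader(&buf, zstd.WithDecoderConcurrency(1))
+	if err != nil {
+		return nil, err
+	}
+	defer dec.Close()
+	return io.ReadAll(dec)
+}
+
+func main() {
+	out, err := roundTrip([]byte("hello, synchronous zstd"))
+	if err != nil {
+		os.Exit(1)
+	}
+	os.Stdout.Write(out)
+}
+```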
+ +
+ See changes to v1.14.x
+
+* Feb 22, 2022 (v1.14.4)
+  * flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)
+  * zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)
+  * zip: Don't read data descriptor early by @saracen in [#501](https://github.com/klauspost/compress/pull/501)
+  * huff0: Use static decompression buffer, up to 30% faster, by @klauspost in [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500)
+
+* Feb 17, 2022 (v1.14.3)
+  * flate: Improve fastest levels' compression speed, ~10% more throughput. [#482](https://github.com/klauspost/compress/pull/482) [#489](https://github.com/klauspost/compress/pull/489) [#490](https://github.com/klauspost/compress/pull/490) [#491](https://github.com/klauspost/compress/pull/491) [#494](https://github.com/klauspost/compress/pull/494) [#478](https://github.com/klauspost/compress/pull/478)
+  * flate: Faster decompression speed, ~5-10%. [#483](https://github.com/klauspost/compress/pull/483)
+  * s2: Faster compression with Go v1.18 and amd64 microarch level 3+. [#484](https://github.com/klauspost/compress/pull/484) [#486](https://github.com/klauspost/compress/pull/486)
+
+* Jan 25, 2022 (v1.14.2)
+  * zstd: Improve header decoder by @dsnet [#476](https://github.com/klauspost/compress/pull/476)
+  * zstd: Add bigger default blocks [#469](https://github.com/klauspost/compress/pull/469)
+  * zstd: Remove unused decompression buffer [#470](https://github.com/klauspost/compress/pull/470)
+  * zstd: Fix logically dead code by @ningmingxiao [#472](https://github.com/klauspost/compress/pull/472)
+  * flate: Improve level 7-9 [#471](https://github.com/klauspost/compress/pull/471) [#473](https://github.com/klauspost/compress/pull/473)
+  * zstd: Add noasm tag for xxhash [#475](https://github.com/klauspost/compress/pull/475)
+
+* Jan 11, 2022 (v1.14.1)
+  * s2: Add stream index in [#462](https://github.com/klauspost/compress/pull/462)
+  * flate: Speed and efficiency improvements in [#439](https://github.com/klauspost/compress/pull/439) [#461](https://github.com/klauspost/compress/pull/461) [#455](https://github.com/klauspost/compress/pull/455) [#452](https://github.com/klauspost/compress/pull/452) [#458](https://github.com/klauspost/compress/pull/458)
+  * zstd: Performance improvements in [#420](https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468)
+  * zstd: Add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
+  * Add garble support for s2 binaries in [#445](https://github.com/klauspost/compress/pull/445)
+
+ +
+ See changes to v1.13.x + +* Aug 30, 2021 (v1.13.5) + * gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425) + * s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413) + * zstd: pooledZipWriter should return Writers to the same pool [#426](https://github.com/klauspost/compress/pull/426) + * Removed golang/snappy as external dependency for tests [#421](https://github.com/klauspost/compress/pull/421) + +* Aug 12, 2021 (v1.13.4) + * Add [snappy replacement package](https://github.com/klauspost/compress/tree/master/snappy). + * zstd: Fix incorrect encoding in "best" mode [#415](https://github.com/klauspost/compress/pull/415) + +* Aug 3, 2021 (v1.13.3) + * zstd: Improve Best compression [#404](https://github.com/klauspost/compress/pull/404) + * zstd: Fix WriteTo error forwarding [#411](https://github.com/klauspost/compress/pull/411) + * gzhttp: Return http.HandlerFunc instead of http.Handler. Unlikely breaking change. [#406](https://github.com/klauspost/compress/pull/406) + * s2sx: Fix max size error [#399](https://github.com/klauspost/compress/pull/399) + * zstd: Add optional stream content size on reset [#401](https://github.com/klauspost/compress/pull/401) + * zstd: use SpeedBestCompression for level >= 10 [#410](https://github.com/klauspost/compress/pull/410) + +* Jun 14, 2021 (v1.13.1) + * s2: Add full Snappy output support [#396](https://github.com/klauspost/compress/pull/396) + * zstd: Add configurable [Decoder window](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithDecoderMaxWindow) size [#394](https://github.com/klauspost/compress/pull/394) + * gzhttp: Add header to skip compression [#389](https://github.com/klauspost/compress/pull/389) + * s2: Improve speed with bigger output margin [#395](https://github.com/klauspost/compress/pull/395) + +* Jun 3, 2021 (v1.13.0) + * Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors. + * zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382) + * zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380) +
+ + +
+ See changes to v1.12.x
+
+* May 25, 2021 (v1.12.3)
+  * deflate: Better/faster Huffman encoding [#374](https://github.com/klauspost/compress/pull/374)
+  * deflate: Allocate less for history. [#375](https://github.com/klauspost/compress/pull/375)
+  * zstd: Forward read errors [#373](https://github.com/klauspost/compress/pull/373)
+
+* Apr 27, 2021 (v1.12.2)
+  * zstd: Improve better/best compression [#360](https://github.com/klauspost/compress/pull/360) [#364](https://github.com/klauspost/compress/pull/364) [#365](https://github.com/klauspost/compress/pull/365)
+  * zstd: Add helpers to compress/decompress zstd inside zip files [#363](https://github.com/klauspost/compress/pull/363)
+  * deflate: Improve level 5+6 compression [#367](https://github.com/klauspost/compress/pull/367)
+  * s2: Improve better/best compression [#358](https://github.com/klauspost/compress/pull/358) [#359](https://github.com/klauspost/compress/pull/359)
+  * s2: Load after checking src limit on amd64. [#362](https://github.com/klauspost/compress/pull/362)
+  * s2sx: Limit max executable size [#368](https://github.com/klauspost/compress/pull/368)
+
+* Apr 14, 2021 (v1.12.1)
+  * snappy package removed. Upstream added as dependency.
+  * s2: Better compression in "best" mode [#353](https://github.com/klauspost/compress/pull/353)
+  * s2sx: Add stdin input and detect pre-compressed from signature [#352](https://github.com/klauspost/compress/pull/352)
+  * s2c/s2d: Add http as possible input [#348](https://github.com/klauspost/compress/pull/348)
+  * s2c/s2d/s2sx: Always truncate when writing files [#352](https://github.com/klauspost/compress/pull/352)
+  * zstd: Reduce memory usage further when using [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) [#346](https://github.com/klauspost/compress/pull/346)
+  * s2: Fix potential problem with amd64 assembly and profilers [#349](https://github.com/klauspost/compress/pull/349)
+
+ +
+ See changes to v1.11.x + +* Mar 26, 2021 (v1.11.13) + * zstd: Big speedup on small dictionary encodes [#344](https://github.com/klauspost/compress/pull/344) [#345](https://github.com/klauspost/compress/pull/345) + * zstd: Add [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) encoder option [#336](https://github.com/klauspost/compress/pull/336) + * deflate: Improve entropy compression [#338](https://github.com/klauspost/compress/pull/338) + * s2: Clean up and minor performance improvement in best [#341](https://github.com/klauspost/compress/pull/341) + +* Mar 5, 2021 (v1.11.12) + * s2: Add `s2sx` binary that creates [self extracting archives](https://github.com/klauspost/compress/tree/master/s2#s2sx-self-extracting-archives). + * s2: Speed up decompression on non-assembly platforms [#328](https://github.com/klauspost/compress/pull/328) + +* Mar 1, 2021 (v1.11.9) + * s2: Add ARM64 decompression assembly. Around 2x output speed. [#324](https://github.com/klauspost/compress/pull/324) + * s2: Improve "better" speed and efficiency. [#325](https://github.com/klauspost/compress/pull/325) + * s2: Fix binaries. + +* Feb 25, 2021 (v1.11.8) + * s2: Fixed occasional out-of-bounds write on amd64. Upgrade recommended. + * s2: Add AMD64 assembly for better mode. 25-50% faster. [#315](https://github.com/klauspost/compress/pull/315) + * s2: Less upfront decoder allocation. [#322](https://github.com/klauspost/compress/pull/322) + * zstd: Faster "compression" of incompressible data. [#314](https://github.com/klauspost/compress/pull/314) + * zip: Fix zip64 headers. [#313](https://github.com/klauspost/compress/pull/313) + +* Jan 14, 2021 (v1.11.7) + * Use Bytes() interface to get bytes across packages. [#309](https://github.com/klauspost/compress/pull/309) + * s2: Add 'best' compression option. [#310](https://github.com/klauspost/compress/pull/310) + * s2: Add ReaderMaxBlockSize, changes `s2.NewReader` signature to include varargs. [#311](https://github.com/klauspost/compress/pull/311) + * s2: Fix crash on small better buffers. [#308](https://github.com/klauspost/compress/pull/308) + * s2: Clean up decoder. [#312](https://github.com/klauspost/compress/pull/312) + +* Jan 7, 2021 (v1.11.6) + * zstd: Make decoder allocations smaller [#306](https://github.com/klauspost/compress/pull/306) + * zstd: Free Decoder resources when Reset is called with a nil io.Reader [#305](https://github.com/klauspost/compress/pull/305) + +* Dec 20, 2020 (v1.11.4) + * zstd: Add Best compression mode [#304](https://github.com/klauspost/compress/pull/304) + * Add header decoder [#299](https://github.com/klauspost/compress/pull/299) + * s2: Add uncompressed stream option [#297](https://github.com/klauspost/compress/pull/297) + * Simplify/speed up small blocks with known max size. 
[#300](https://github.com/klauspost/compress/pull/300) + * zstd: Always reset literal dict encoder [#303](https://github.com/klauspost/compress/pull/303) + +* Nov 15, 2020 (v1.11.3) + * inflate: 10-15% faster decompression [#293](https://github.com/klauspost/compress/pull/293) + * zstd: Tweak DecodeAll default allocation [#295](https://github.com/klauspost/compress/pull/295) + +* Oct 11, 2020 (v1.11.2) + * s2: Fix out of bounds read in "better" block compression [#291](https://github.com/klauspost/compress/pull/291) + +* Oct 1, 2020 (v1.11.1) + * zstd: Set allLitEntropy true in default configuration [#286](https://github.com/klauspost/compress/pull/286) + +* Sept 8, 2020 (v1.11.0) + * zstd: Add experimental compression [dictionaries](https://github.com/klauspost/compress/tree/master/zstd#dictionaries) [#281](https://github.com/klauspost/compress/pull/281) + * zstd: Fix mixed Write and ReadFrom calls [#282](https://github.com/klauspost/compress/pull/282) + * inflate/gz: Limit variable shifts, ~5% faster decompression [#274](https://github.com/klauspost/compress/pull/274) +
+ +
+ See changes to v1.10.x + +* July 8, 2020 (v1.10.11) + * zstd: Fix extra block when compressing with ReadFrom. [#278](https://github.com/klauspost/compress/pull/278) + * huff0: Also populate compression table when reading decoding table. [#275](https://github.com/klauspost/compress/pull/275) + +* June 23, 2020 (v1.10.10) + * zstd: Skip entropy compression in fastest mode when no matches. [#270](https://github.com/klauspost/compress/pull/270) + +* June 16, 2020 (v1.10.9): + * zstd: API change for specifying dictionaries. See [#268](https://github.com/klauspost/compress/pull/268) + * zip: update CreateHeaderRaw to handle zip64 fields. [#266](https://github.com/klauspost/compress/pull/266) + * Fuzzit tests removed. The service has been purchased and is no longer available. + +* June 5, 2020 (v1.10.8): + * 1.15x faster zstd block decompression. [#265](https://github.com/klauspost/compress/pull/265) + +* June 1, 2020 (v1.10.7): + * Added zstd decompression [dictionary support](https://github.com/klauspost/compress/tree/master/zstd#dictionaries) + * Increase zstd decompression speed up to 1.19x. [#259](https://github.com/klauspost/compress/pull/259) + * Remove internal reset call in zstd compression and reduce allocations. [#263](https://github.com/klauspost/compress/pull/263) + +* May 21, 2020: (v1.10.6) + * zstd: Reduce allocations while decoding. [#258](https://github.com/klauspost/compress/pull/258), [#252](https://github.com/klauspost/compress/pull/252) + * zstd: Stricter decompression checks. + +* April 12, 2020: (v1.10.5) + * s2-commands: Flush output when receiving SIGINT. [#239](https://github.com/klauspost/compress/pull/239) + +* Apr 8, 2020: (v1.10.4) + * zstd: Minor/special case optimizations. [#251](https://github.com/klauspost/compress/pull/251), [#250](https://github.com/klauspost/compress/pull/250), [#249](https://github.com/klauspost/compress/pull/249), [#247](https://github.com/klauspost/compress/pull/247) +* Mar 11, 2020: (v1.10.3) + * s2: Use S2 encoder in pure Go mode for Snappy output as well. [#245](https://github.com/klauspost/compress/pull/245) + * s2: Fix pure Go block encoder. [#244](https://github.com/klauspost/compress/pull/244) + * zstd: Added "better compression" mode. [#240](https://github.com/klauspost/compress/pull/240) + * zstd: Improve speed of fastest compression mode by 5-10% [#241](https://github.com/klauspost/compress/pull/241) + * zstd: Skip creating encoders when not needed. [#238](https://github.com/klauspost/compress/pull/238) + +* Feb 27, 2020: (v1.10.2) + * Close to 50% speedup in inflate (gzip/zip decompression). [#236](https://github.com/klauspost/compress/pull/236) [#234](https://github.com/klauspost/compress/pull/234) [#232](https://github.com/klauspost/compress/pull/232) + * Reduce deflate level 1-6 memory usage up to 59%. [#227](https://github.com/klauspost/compress/pull/227) + +* Feb 18, 2020: (v1.10.1) + * Fix zstd crash when resetting multiple times without sending data. [#226](https://github.com/klauspost/compress/pull/226) + * deflate: Fix dictionary use on level 1-6. [#224](https://github.com/klauspost/compress/pull/224) + * Remove deflate writer reference when closing. 
[#224](https://github.com/klauspost/compress/pull/224) + +* Feb 4, 2020: (v1.10.0) + * Add optional dictionary to [stateless deflate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc#StatelessDeflate). Breaking change, send `nil` for previous behaviour. [#216](https://github.com/klauspost/compress/pull/216) + * Fix buffer overflow on repeated small block deflate. [#218](https://github.com/klauspost/compress/pull/218) + * Allow copying content from an existing ZIP file without decompressing+compressing. [#214](https://github.com/klauspost/compress/pull/214) + * Added [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) AMD64 assembler and various optimizations. Stream speed >10GB/s. [#186](https://github.com/klauspost/compress/pull/186) + +
+ +
+ See changes prior to v1.10.0
+
+* Jan 20, 2020 (v1.9.8) Optimize gzip/deflate with better size estimates and faster table generation. [#207](https://github.com/klauspost/compress/pull/207) by [luyu6056](https://github.com/luyu6056), [#206](https://github.com/klauspost/compress/pull/206).
+* Jan 11, 2020: S2 Encode/Decode will use provided buffer if capacity is big enough. [#204](https://github.com/klauspost/compress/pull/204)
+* Jan 5, 2020: (v1.9.7) Fix another zstd regression in v1.9.5 - v1.9.6 removed.
+* Jan 4, 2020: (v1.9.6) Fixed regression from v1.9.5 that caused corrupt zstd encodes in rare cases.
+* Jan 4, 2020: Faster IO in [s2c + s2d commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) compression/decompression. [#192](https://github.com/klauspost/compress/pull/192)
+* Dec 29, 2019: Removed v1.9.5 since fuzz tests showed a compatibility problem with the reference zstandard decoder.
+* Dec 29, 2019: (v1.9.5) zstd: 10-20% faster block compression. [#199](https://github.com/klauspost/compress/pull/199)
+* Dec 29, 2019: [zip](https://godoc.org/github.com/klauspost/compress/zip) package updated with latest Go features.
+* Dec 29, 2019: zstd: Single segment flag conditions tweaked. [#197](https://github.com/klauspost/compress/pull/197)
+* Dec 18, 2019: s2: Faster compression when ReadFrom is used. [#198](https://github.com/klauspost/compress/pull/198)
+* Dec 10, 2019: s2: Fix repeat length output when just above the 16MB limit.
+* Dec 10, 2019: zstd: Add function to get decoder as io.ReadCloser. [#191](https://github.com/klauspost/compress/pull/191)
+* Dec 3, 2019: (v1.9.4) S2: limit max repeat length. [#188](https://github.com/klauspost/compress/pull/188)
+* Dec 3, 2019: Add [WithNoEntropyCompression](https://godoc.org/github.com/klauspost/compress/zstd#WithNoEntropyCompression) to zstd [#187](https://github.com/klauspost/compress/pull/187)
+* Dec 3, 2019: Reduce memory use for tests. Check for leaked goroutines.
+* Nov 28, 2019 (v1.9.3) Fewer allocations in stateless deflate.
+* Nov 28, 2019: 5-20% faster huff0 decode. Impacts zstd as well. [#184](https://github.com/klauspost/compress/pull/184)
+* Nov 12, 2019 (v1.9.2) Added [Stateless Compression](#stateless-compression) for gzip/deflate.
+* Nov 12, 2019: Fixed zstd decompression of large single blocks. [#180](https://github.com/klauspost/compress/pull/180)
+* Nov 11, 2019: Set default [s2c](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) block size to 4MB.
+* Nov 11, 2019: Reduce inflate memory use by 1KB.
+* Nov 10, 2019: Fewer allocations in deflate bit writer.
+* Nov 10, 2019: Fix inconsistent error returned by zstd decoder.
+* Oct 28, 2019 (v1.9.1) zstd: Fix crash when compressing blocks. [#174](https://github.com/klauspost/compress/pull/174)
+* Oct 24, 2019 (v1.9.0) zstd: Fix rare data corruption [#173](https://github.com/klauspost/compress/pull/173)
+* Oct 24, 2019 zstd: Fix huff0 out of buffer write [#171](https://github.com/klauspost/compress/pull/171) and always return errors [#172](https://github.com/klauspost/compress/pull/172)
+* Oct 10, 2019: Big deflate rewrite, 30-40% faster with better compression [#105](https://github.com/klauspost/compress/pull/105)
+
+ +
+ See changes prior to v1.9.0
+
+* Oct 10, 2019: (v1.8.6) zstd: Allow partial reads to get flushed data. [#169](https://github.com/klauspost/compress/pull/169)
+* Oct 3, 2019: Fix inconsistent results on broken zstd streams.
+* Sep 25, 2019: Added `-rm` (remove source files) and `-q` (no output except errors) to `s2c` and `s2d` [commands](https://github.com/klauspost/compress/tree/master/s2#commandline-tools)
+* Sep 16, 2019: (v1.8.4) Add `s2c` and `s2d` [commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools).
+* Sep 10, 2019: (v1.8.3) Fix s2 decoder [Skip](https://godoc.org/github.com/klauspost/compress/s2#Reader.Skip).
+* Sep 7, 2019: zstd: Added [WithWindowSize](https://godoc.org/github.com/klauspost/compress/zstd#WithWindowSize), contributed by [ianwilkes](https://github.com/ianwilkes).
+* Sep 5, 2019: (v1.8.2) Add [WithZeroFrames](https://godoc.org/github.com/klauspost/compress/zstd#WithZeroFrames) which adds full zero payload block encoding option.
+* Sep 5, 2019: Lazy initialization of zstandard predefined en/decoder tables.
+* Aug 26, 2019: (v1.8.1) S2: 1-2% compression increase in "better" compression mode.
+* Aug 26, 2019: zstd: Check maximum size of Huffman 1X compressed literals while decoding.
+* Aug 24, 2019: (v1.8.0) Added [S2 compression](https://github.com/klauspost/compress/tree/master/s2#s2-compression), a high performance replacement for Snappy.
+* Aug 21, 2019: (v1.7.6) Fixed minor issues found by fuzzer. One could lead to zstd not decompressing.
+* Aug 18, 2019: Add [fuzzit](https://fuzzit.dev/) continuous fuzzing.
+* Aug 14, 2019: zstd: Skip incompressible data 2x faster. [#147](https://github.com/klauspost/compress/pull/147)
+* Aug 4, 2019 (v1.7.5): Better literal compression. [#146](https://github.com/klauspost/compress/pull/146)
+* Aug 4, 2019: Faster zstd compression. [#143](https://github.com/klauspost/compress/pull/143) [#144](https://github.com/klauspost/compress/pull/144)
+* Aug 4, 2019: Faster zstd decompression. [#145](https://github.com/klauspost/compress/pull/145) [#143](https://github.com/klauspost/compress/pull/143) [#142](https://github.com/klauspost/compress/pull/142)
+* July 15, 2019 (v1.7.4): Fix double EOF block in rare cases on zstd encoder.
+* July 15, 2019 (v1.7.3): Minor speedup/compression increase in default zstd encoder.
+* July 14, 2019: zstd decoder: Fix decompression error on multiple uses with mixed content.
+* July 7, 2019 (v1.7.2): Snappy update, zstd decoder potential race fix.
+* June 17, 2019: zstd decompression bugfix.
+* June 17, 2019: Fix 32-bit builds.
+* June 17, 2019: Easier use in modules (fewer dependencies).
+* June 9, 2019: New stronger "default" [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression mode. Matches zstd default compression ratio.
+* June 5, 2019: 20-40% higher throughput in [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and better compression.
+* June 5, 2019: deflate/gzip compression: Reduce memory usage of lower compression levels.
+* June 2, 2019: Added [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression!
+* May 25, 2019: deflate/gzip: 10% faster bit writer, mostly visible in lower levels.
+* Apr 22, 2019: [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) decompression added.
+* Aug 1, 2018: Added [huff0 README](https://github.com/klauspost/compress/tree/master/huff0#huff0-entropy-compression).
+* Jul 8, 2018: Added [Performance Update 2018](#performance-update-2018) below.
+* Jun 23, 2018: Merged [Go 1.11 inflate optimizations](https://go-review.googlesource.com/c/go/+/102235). Go 1.9 is now required. Backwards compatible version tagged with [v1.3.0](https://github.com/klauspost/compress/releases/tag/v1.3.0).
+* Apr 2, 2018: Added [huff0](https://godoc.org/github.com/klauspost/compress/huff0) en/decoder. Experimental for now, API may change.
+* Mar 4, 2018: Added [FSE Entropy](https://godoc.org/github.com/klauspost/compress/fse) en/decoder. Experimental for now, API may change.
+* Nov 3, 2017: Add compression [Estimate](https://godoc.org/github.com/klauspost/compress#Estimate) function.
+* May 28, 2017: Reduce allocations when resetting decoder.
+* Apr 02, 2017: Change back to official crc32, since changes were merged in Go 1.7.
+* Jan 14, 2017: Reduce stack pressure due to array copies. See [Issue #18625](https://github.com/golang/go/issues/18625).
+* Oct 25, 2016: Levels 2-4 have been rewritten and now offer significantly better performance than before.
+* Oct 20, 2016: Port zlib changes from Go 1.7 to fix zlib writer issue. Please update.
+* Oct 16, 2016: Go 1.7 changes merged. Apples to apples, this package is a few percent faster, but has a significantly better balance between speed and compression per level.
+* Mar 24, 2016: Always attempt Huffman encoding on levels 4-7. This improves base-64 encoded data compression.
+* Mar 24, 2016: Small speedup for levels 1-3.
+* Feb 19, 2016: Faster bit writer, level -2 is 15% faster, level 1 is 4% faster.
+* Feb 19, 2016: Handle small payloads faster in levels 1-3.
+* Feb 19, 2016: Added faster level 2 + 3 compression modes.
+* Feb 19, 2016: [Rebalanced compression levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/), so there is a more even progression in terms of compression. New default level is 5.
+* Feb 14, 2016: Snappy: Merge upstream changes.
+* Feb 14, 2016: Snappy: Fix aggressive skipping.
+* Feb 14, 2016: Snappy: Update benchmark.
+* Feb 13, 2016: Deflate: Fixed assembler problem that could lead to sub-optimal compression.
+* Feb 12, 2016: Snappy: Added AMD64 SSE 4.2 optimizations to matching, which makes easy-to-compress material run faster. Typical speedup is around 25%.
+* Feb 9, 2016: Added Snappy package fork. This version is 5-7% faster, much more on hard-to-compress content.
+* Jan 30, 2016: Optimize levels 1 to 3 by not considering static dictionary or storing uncompressed. ~4-5% speedup.
+* Jan 16, 2016: Optimization on deflate levels 1, 2 and 3 compression.
+* Jan 8 2016: Merge [CL 18317](https://go-review.googlesource.com/#/c/18317): fix reading, writing of zip64 archives.
+* Dec 8 2015: Make level 1 and -2 deterministic even if write size differs.
+* Dec 8 2015: Split encoding functions, so hashing and matching can potentially be inlined. 1-3% faster on AMD64. 5% faster on other platforms.
+* Dec 8 2015: Fixed rare [one-byte out-of-bounds read](https://github.com/klauspost/compress/issues/20). Please update!
+* Nov 23 2015: Optimization on token writer. ~2-4% faster. Contributed by [@dsnet](https://github.com/dsnet).
+* Nov 20 2015: Small optimization to bit writer on 64 bit systems.
+* Nov 17 2015: Fixed out-of-bounds errors if the underlying Writer returned an error. See [#15](https://github.com/klauspost/compress/issues/15).
+* Nov 12 2015: Added [io.WriterTo](https://golang.org/pkg/io/#WriterTo) support to gzip/inflate.
+* Nov 11 2015: Merged [CL 16669](https://go-review.googlesource.com/#/c/16669/4): archive/zip: enable overriding (de)compressors per file
+* Oct 15 2015: Added skipping on uncompressible data. Random data speedup >5x.
+
+
+# deflate usage
+
+The packages are drop-in replacements for the standard libraries. Simply replace the import path to use them:
+
+| old import | new import | Documentation |
+|--------------------|-----------------------------------------|--------------------|
+| `compress/gzip` | `github.com/klauspost/compress/gzip` | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc) |
+| `compress/zlib` | `github.com/klauspost/compress/zlib` | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc) |
+| `archive/zip` | `github.com/klauspost/compress/zip` | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc) |
+| `compress/flate` | `github.com/klauspost/compress/flate` | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc) |
+
+* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a drop-in replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).
+
+You may also be interested in [pgzip](https://github.com/klauspost/pgzip), a drop-in replacement for gzip which supports multithreaded compression of big files, and in the optimized [crc32](https://github.com/klauspost/crc32) package used by these packages.
+
+The packages contain the same functionality as the standard library, so you can use its godoc as reference: [gzip](http://golang.org/pkg/compress/gzip/), [zip](http://golang.org/pkg/archive/zip/), [zlib](http://golang.org/pkg/compress/zlib/), [flate](http://golang.org/pkg/compress/flate/).
+
+Currently there is only a minor speedup on decompression (mostly CRC32 calculation).
+
+Memory usage is typically 1MB for a Writer. The stdlib is in the same range.
+If you expect to have a lot of concurrently allocated Writers, consider using
+the stateless compression described below.
+
+For compression performance, see: [this spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing).
+
+To disable all assembly, add `-tags=noasm`. This works across all packages.
+
+# Stateless compression
+
+This package offers stateless compression as a special option for gzip/deflate.
+It will do compression but without maintaining any state between Write calls.
+
+This means there will be no memory kept between Write calls, but compression and speed will be suboptimal.
+
+This is only relevant in cases where you expect to run many thousands of compressors concurrently,
+but with very little activity. This is *not* intended for regular web servers serving individual requests.
+
+Because of this, the size of actual Write calls will affect output size.
+
+In gzip, specify level `-3` / `gzip.StatelessCompression` to enable.
+
+For direct deflate use, NewStatelessWriter and StatelessDeflate are available. See the [documentation](https://godoc.org/github.com/klauspost/compress/flate#NewStatelessWriter).
+
+A `bufio.Writer` can of course be used to control write sizes. For example, to use a 4KB buffer:
+
+```go
+    // replace 'io.Discard' with your output.
+    gzw, err := gzip.NewWriterLevel(io.Discard, gzip.StatelessCompression)
+    if err != nil {
+        return err
+    }
+    defer gzw.Close()
+
+    w := bufio.NewWriterSize(gzw, 4096)
+    defer w.Flush()
+
+    // Write to 'w'
+```
+
+This will only use up to 4KB in memory when the writer is idle.
+
+Compression is almost always worse than with the fastest compression level,
+and each write will allocate (a little) memory.
+
+# Performance Update 2018
+
+It has been a while since we last looked at the speed of this package compared to the standard library, so I thought I would re-do my tests and give some overall recommendations based on the current state. All benchmarks have been performed with Go 1.10 on my Desktop Intel(R) Core(TM) i7-2600 CPU @3.40GHz. Since I last ran the tests, I have gotten more RAM, which means tests with big files are no longer limited by my SSD.
+
+The raw results are in my [updated spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing). Due to cgo changes and upstream updates I could not get the cgo version of gzip to compile. Instead I included the [zstd](https://github.com/datadog/zstd) cgo implementation. If I get cgo gzip to work again, I might replace the results in the sheet.
+
+The columns to take note of are: *MB/s* - the throughput. *Reduction* - the data size reduction in percent of the original. *Rel Speed* - relative speed compared to the standard library at the same level. *Smaller* - how many percent smaller the compressed output is compared to stdlib. Negative means the output was bigger. *Loss* means the loss (or gain) in compression as a percentage difference of the input.
+
+The `gzstd` (standard library gzip) and `gzkp` (this package's gzip) use only one CPU core. [`pgzip`](https://github.com/klauspost/pgzip) and [`bgzf`](https://github.com/biogo/hts/tree/master/bgzf) use all 4 cores. [`zstd`](https://github.com/DataDog/zstd) uses one core, and is a beast (but not Go, yet).
+
+
+## Overall differences
+
+There appears to be a roughly 5-10% speed advantage over the standard library when comparing at similar compression levels.
+
+The biggest difference you will see is the result of [re-balancing](https://blog.klauspost.com/rebalancing-deflate-compression-levels/) the compression levels. I wanted my library to give a smoother transition between the compression levels than the standard library.
+
+This package attempts to provide a smoother transition, where "1" takes a lot of shortcuts, "5" is the reasonable trade-off, "9" is "give me the best compression", and the values in between give something reasonable in between. The standard library has big differences in levels 1-4, while levels 5-9 show no significant gains - often spending a lot more time than can be justified by the achieved compression.
+
+There are links to all the test data in the [spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing) in the top left field on each tab.
+
+## Web Content
+
+This test set aims to emulate typical use in a web server. The test set is 4GB of data in 53k files, and is a mixture of (mostly) HTML, JS and CSS.
+
+Since levels 1 and 9 are close to being the same code, they are quite close in results. But looking at the levels in between, the differences are quite big.
+
+Looking at level 6, this package is 88% faster, but will output about 6% more data.
+For a web server, this means you can serve 88% more data, but have to pay for 6% more bandwidth. You can draw your own conclusions on what would be the most expensive for your case.
+
+## Object files
+
+This test is for typical data files stored on a server. In this case it is a collection of Go precompiled objects. They are very compressible.
+
+The picture is similar to the web content, but with small differences since this is very compressible. Levels 2-3 offer good speed, but sacrifice quite a bit of compression.
+
+The standard library seems suboptimal on levels 3 and 4 - offering both worse compression and speed than levels 6 & 7 of this package respectively.
+
+## Highly Compressible File
+
+This is a JSON file with very high redundancy. The reduction starts at 95% on level 1, so in real life terms we are dealing with something like a highly redundant stream of data, etc.
+
+It is definitely visible that we are dealing with specialized content here, so the results are very scattered. This package does not do very well at levels 1-4, but picks up significantly at level 5, with levels 7 and 8 offering great speed for the achieved compression.
+
+So if you know your content is extremely compressible you might want to go slightly higher than the defaults. The standard library has a huge gap between levels 3 and 4 in terms of speed (2.75x slowdown), so it offers little "middle ground".
+
+## Medium-High Compressible
+
+This is a pretty common test corpus: [enwik9](http://mattmahoney.net/dc/textdata.html). It contains the first 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006. This is a very good test of typical text-based compression and more data-heavy streams.
+
+We see a similar picture here as in "Web Content". On equal levels some compression is sacrificed for more speed. Level 5 seems to be the best trade-off between speed and size, beating stdlib level 3 in both.
+
+## Medium Compressible
+
+I will combine two test sets, one [10GB file set](http://mattmahoney.net/dc/10gb.html) and a VM disk image (~8GB). Both contain different data types and represent a typical backup scenario.
+
+The most notable thing is how quickly the standard library drops to very low compression speeds around levels 5-6 without any big gains in compression. Since this type of data is fairly common, this does not seem like good behavior.
+
+
+## Un-compressible Content
+
+This is mainly a test of how good the algorithms are at detecting un-compressible input. The standard library only offers this feature with very conservative settings at level 1. Obviously there is no reason for the algorithms to try to compress input that cannot be compressed. The only downside is that it might skip some compressible data on false detections.
+
+
+## Huffman only compression
+
+This compression library adds a special compression level, named `HuffmanOnly`, which allows near-linear-time compression. This is done by completely disabling matching of previous data, and only reducing the number of bits used to represent each character.
+
+This means that often-used characters, like 'e' and ' ' (space) in text, use the fewest bits to represent, and rare characters like '¤' take more bits to represent. For more information see [wikipedia](https://en.wikipedia.org/wiki/Huffman_coding) or this nice [video](https://youtu.be/ZdooBTdW5bM).
+
+Since this type of compression has much less variance, the compression speed is mostly unaffected by the input data, and is usually more than *180MB/s* for a single core.
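+
+As a minimal sketch of enabling this mode through this package's `flate` writer (the sample input and buffering choices are illustrative):
+
+```go
+package main
+
+import (
+    "bufio"
+    "os"
+
+    "github.com/klauspost/compress/flate"
+)
+
+func main() {
+    out := bufio.NewWriter(os.Stdout)
+    defer out.Flush()
+
+    // HuffmanOnly (level -2) skips all match searching and only
+    // entropy-codes each byte, giving near-linear-time compression.
+    fw, err := flate.NewWriter(out, flate.HuffmanOnly)
+    if err != nil {
+        panic(err)
+    }
+    defer fw.Close()
+
+    if _, err := fw.Write([]byte("example text with repeated characters: eeeee     ")); err != nil {
+        panic(err)
+    }
+}
+```
+
+The gzip sub-package exposes the same mode as `gzip.HuffmanOnly`.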
+
+The downside is that the compression ratio is usually considerably worse than even the fastest conventional compression. The compression ratio can never be better than 8:1 (12.5%).
+
+The linear-time compression can be used as a "better than nothing" mode, where you cannot risk the encoder slowing down on some content. For comparison, the size of the "Twain" text is *233460 bytes* (+29% vs. level 1) and the encode speed is 144MB/s (4.5x level 1). So in this case you trade a 30% size increase for a 4 times speedup.
+
+For more information see my blog post on [Fast Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).
+
+This is implemented in Go 1.7 as "Huffman Only" mode, though it is not exposed for gzip.
+
+# Other packages
+
+Here are other packages of good quality and pure Go (no cgo wrappers or autoconverted code):
+
+* [github.com/pierrec/lz4](https://github.com/pierrec/lz4) - strong multithreaded LZ4 compression.
+* [github.com/cosnicolaou/pbzip2](https://github.com/cosnicolaou/pbzip2) - multithreaded bzip2 decompression.
+* [github.com/dsnet/compress](https://github.com/dsnet/compress) - brotli decompression, bzip2 writer.
+* [github.com/ronanh/intcomp](https://github.com/ronanh/intcomp) - Integer compression.
+* [github.com/spenczar/fpc](https://github.com/spenczar/fpc) - Float compression.
+* [github.com/minio/zipindex](https://github.com/minio/zipindex) - External ZIP directory index.
+* [github.com/ybirader/pzip](https://github.com/ybirader/pzip) - Fast concurrent zip archiver and extractor.
+
+# license
+
+This code is licensed under the same conditions as the original Go code. See LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/SECURITY.md b/vendor/github.com/klauspost/compress/SECURITY.md
new file mode 100644
index 0000000000..ca6685e2b7
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/SECURITY.md
@@ -0,0 +1,25 @@
+# Security Policy
+
+## Supported Versions
+
+Security updates are applied only to the latest release.
+
+## Vulnerability Definition
+
+A security vulnerability is a bug that, given certain input, triggers a crash or an infinite loop. Most calls will have varying execution time, and only in rare cases will slow operation be considered a security vulnerability.
+
+Corrupted output generally is not considered a security vulnerability, unless independent operations are able to affect each other. Note that not all functionality is re-entrant and safe to use concurrently.
+
+Out-of-memory crashes only apply if the en/decoder uses an abnormal amount of memory with appropriate options applied to limit maximum window size, concurrency, etc. However, if you are in doubt you are welcome to file a security issue.
+
+It is assumed that all callers are trusted, meaning internal data exposed through reflection or inspection of returned data structures is not considered a vulnerability.
+
+Vulnerabilities resulting from compiler/assembler errors should be reported upstream. Depending on the severity, this package may or may not implement a workaround.
+
+## Reporting a Vulnerability
+
+If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
+
+Please disclose it at [security advisory](https://github.com/klauspost/compress/security/advisories/new). If possible, please provide a minimal reproducer. If the issue only applies to a single platform, it would be helpful to provide access to that.
+
+This project is maintained by a team of volunteers on a reasonable-effort basis. As such, vulnerabilities will be disclosed on a best-effort basis.
diff --git a/vendor/github.com/klauspost/compress/compressible.go b/vendor/github.com/klauspost/compress/compressible.go
new file mode 100644
index 0000000000..ea5a692d51
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/compressible.go
@@ -0,0 +1,85 @@
+package compress
+
+import "math"
+
+// Estimate returns a normalized compressibility estimate of block b.
+// Values close to zero are likely uncompressible.
+// Values above 0.1 are likely to be compressible.
+// Values above 0.5 are very compressible.
+// Very small lengths will return 0.
+func Estimate(b []byte) float64 {
+    if len(b) < 16 {
+        return 0
+    }
+
+    // Correctly predicted order 1
+    hits := 0
+    lastMatch := false
+    var o1 [256]byte
+    var hist [256]int
+    c1 := byte(0)
+    for _, c := range b {
+        if c == o1[c1] {
+            // We only count a hit if there were two correct predictions in a row.
+            if lastMatch {
+                hits++
+            }
+            lastMatch = true
+        } else {
+            lastMatch = false
+        }
+        o1[c1] = c
+        c1 = c
+        hist[c]++
+    }
+
+    // Use x^0.6 to give better spread
+    prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
+
+    // Calculate histogram distribution
+    variance := float64(0)
+    avg := float64(len(b)) / 256
+
+    for _, v := range hist {
+        Δ := float64(v) - avg
+        variance += Δ * Δ
+    }
+
+    stddev := math.Sqrt(float64(variance)) / float64(len(b))
+    exp := math.Sqrt(1 / float64(len(b)))
+
+    // Subtract expected stddev
+    stddev -= exp
+    if stddev < 0 {
+        stddev = 0
+    }
+    stddev *= 1 + exp
+
+    // Use x^0.4 to give better spread
+    entropy := math.Pow(stddev, 0.4)
+
+    // 50/50 weight between prediction and histogram distribution
+    return math.Pow((prediction+entropy)/2, 0.9)
+}
+
+// ShannonEntropyBits returns the number of bits minimum required to represent
+// an entropy encoding of the input bytes.
+// https://en.wiktionary.org/wiki/Shannon_entropy
+func ShannonEntropyBits(b []byte) int {
+    if len(b) == 0 {
+        return 0
+    }
+    var hist [256]int
+    for _, c := range b {
+        hist[c]++
+    }
+    shannon := float64(0)
+    invTotal := 1.0 / float64(len(b))
+    for _, v := range hist[:] {
+        if v > 0 {
+            n := float64(v)
+            shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+        }
+    }
+    return int(math.Ceil(shannon))
+}
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
index 25dbe3e15f..af53fb860c 100644
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -6,6 +6,8 @@ package flate
 import (
+    "encoding/binary"
+    "errors"
     "fmt"
     "io"
     "math"
@@ -37,15 +39,17 @@ const (
     maxMatchLength = 258 // The longest match for the compressor
     minOffsetSize  = 1   // The shortest offset that makes any sense
 
-    // The maximum number of tokens we put into a single flat block, just too
-    // stop things from getting too large.
-    maxFlateBlockTokens = 1 << 14
+    // The maximum number of tokens we will encode at a time.
+    // Smaller sizes usually create less optimal blocks.
+    // Bigger can make context switching slow.
+    // We use this for levels 7-9, so we make it big.
+ maxFlateBlockTokens = 1 << 15 maxStoreBlockSize = 65535 hashBits = 17 // After 17 performance degrades hashSize = 1 << hashBits hashMask = (1 << hashBits) - 1 hashShift = (hashBits + minMatchLength - 1) / minMatchLength - maxHashOffset = 1 << 24 + maxHashOffset = 1 << 28 skipNever = math.MaxInt32 @@ -70,9 +74,9 @@ var levels = []compressionLevel{ {0, 0, 0, 0, 0, 6}, // Levels 7-9 use increasingly more lazy matching // and increasingly stringent conditions for "good enough". - {8, 8, 24, 16, skipNever, 7}, - {10, 16, 24, 64, skipNever, 8}, - {32, 258, 258, 4096, skipNever, 9}, + {8, 12, 16, 24, skipNever, 7}, + {16, 30, 40, 64, skipNever, 8}, + {32, 258, 258, 1024, skipNever, 9}, } // advancedState contains state for the advanced levels, with bigger hash tables, etc. @@ -81,28 +85,28 @@ type advancedState struct { length int offset int maxInsertIndex int + chainHead int + hashOffset int - // Input hash chains - // hashHead[hashValue] contains the largest inputIndex with the specified hash value - // If hashHead[hashValue] is within the current window, then - // hashPrev[hashHead[hashValue] & windowMask] contains the previous index - // with the same hash value. - chainHead int - hashHead [hashSize]uint32 - hashPrev [windowSize]uint32 - hashOffset int + ii uint16 // position of last match, intended to overflow to reset. // input window: unprocessed data is window[index:windowEnd] index int hashMatch [maxMatchLength + minMatchLength]uint32 - hash uint32 - ii uint16 // position of last match, intended to overflow to reset. + // Input hash chains + // hashHead[hashValue] contains the largest inputIndex with the specified hash value + // If hashHead[hashValue] is within the current window, then + // hashPrev[hashHead[hashValue] & windowMask] contains the previous index + // with the same hash value. + hashHead [hashSize]uint32 + hashPrev [windowSize]uint32 } type compressor struct { compressionLevel + h *huffmanEncoder w *huffmanBitWriter // compression algorithm @@ -127,7 +131,8 @@ func (d *compressor) fillDeflate(b []byte) int { s := d.state if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { // shift the window by windowSize - copy(d.window[:], d.window[windowSize:2*windowSize]) + //copy(d.window[:], d.window[windowSize:2*windowSize]) + *(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:]) s.index -= windowSize d.windowEnd -= windowSize if d.blockStart >= windowSize { @@ -170,7 +175,8 @@ func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { window = d.window[d.blockStart:index] } d.blockStart = index - d.w.writeBlock(tok, eof, window) + //d.w.writeBlock(tok, eof, window) + d.w.writeBlockDynamic(tok, eof, window, d.sync) return d.w.err } return nil @@ -206,7 +212,7 @@ func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { // Should only be used after a start/reset. func (d *compressor) fillWindow(b []byte) { // Do not fill window if we are in store-only or huffman mode. - if d.level <= 0 { + if d.level <= 0 && d.level > -MinCustomWindowSize { return } if d.fast != nil { @@ -253,7 +259,6 @@ func (d *compressor) fillWindow(b []byte) { // Set the head of the hash chain to us. s.hashHead[newH] = uint32(di + s.hashOffset) } - s.hash = newH } // Update window information. d.windowEnd += n @@ -263,7 +268,7 @@ func (d *compressor) fillWindow(b []byte) { // Try to find a match starting at index whose length is greater than prevSize. // We only look at chainCount possibilities before giving up. 
 // pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead
-func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
+func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) {
     minMatchLook := maxMatchLength
     if lookahead < minMatchLook {
         minMatchLook = lookahead
@@ -279,36 +284,78 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) {
     // If we've got a match that's good enough, only look in 1/4 the chain.
     tries := d.chain
-    length = prevLength
-    if length >= d.good {
-        tries >>= 2
-    }
+    length = minMatchLength - 1
     wEnd := win[pos+length]
     wPos := win[pos:]
     minIndex := pos - windowSize
+    if minIndex < 0 {
+        minIndex = 0
+    }
+    offset = 0
+
+    if d.chain < 100 {
+        for i := prevHead; tries > 0; tries-- {
+            if wEnd == win[i+length] {
+                n := matchLen(win[i:i+minMatchLook], wPos)
+                if n > length {
+                    length = n
+                    offset = pos - i
+                    ok = true
+                    if n >= nice {
+                        // The match is good enough that we don't try to find a better one.
+                        break
+                    }
+                    wEnd = win[pos+n]
+                }
+            }
+            if i <= minIndex {
+                // hashPrev[i & windowMask] has already been overwritten, so stop now.
+                break
+            }
+            i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
+            if i < minIndex {
+                break
+            }
+        }
+        return
+    }
+
+    // Minimum gain to accept a match.
+    cGain := 4
+
+    // Some like it higher (CSV), some like it lower (JSON)
+    const baseCost = 3
+    // Base is 4 bytes with an additional cost.
+    // Matches must be better than this.
 
     for i := prevHead; tries > 0; tries-- {
         if wEnd == win[i+length] {
             n := matchLen(win[i:i+minMatchLook], wPos)
-
-            if n > length && (n > minMatchLength || pos-i <= 4096) {
-                length = n
-                offset = pos - i
-                ok = true
-                if n >= nice {
-                    // The match is good enough that we don't try to find a better one.
-                    break
+            if n > length {
+                // Calculate estimated gain.
+                newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]])
+
+                //fmt.Println("gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]), "this-len:", n, "prev-len:", length)
+                if newGain > cGain {
+                    length = n
+                    offset = pos - i
+                    cGain = newGain
+                    ok = true
+                    if n >= nice {
+                        // The match is good enough that we don't try to find a better one.
+                        break
+                    }
+                    wEnd = win[pos+n]
                 }
-                wEnd = win[pos+n]
             }
         }
-        if i == minIndex {
+        if i <= minIndex {
             // hashPrev[i & windowMask] has already been overwritten, so stop now.
             break
         }
         i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
-        if i < minIndex || i < 0 {
+        if i < minIndex {
             break
         }
     }
@@ -327,8 +374,13 @@ func (d *compressor) writeStoredBlock(buf []byte) error {
 // of the supplied slice.
 // The caller must ensure that len(b) >= 4.
 func hash4(b []byte) uint32 {
-    b = b[:4]
-    return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits)
+    return hash4u(binary.LittleEndian.Uint32(b), hashBits)
+}
+
+// hash4u returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> (32 - h) } // bulkHash4 will compute hashes using the same @@ -337,11 +389,12 @@ func bulkHash4(b []byte, dst []uint32) { if len(b) < 4 { return } - hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 + hb := binary.LittleEndian.Uint32(b) + dst[0] = hash4u(hb, hashBits) end := len(b) - 4 + 1 for i := 1; i < end; i++ { - hb = (hb << 8) | uint32(b[i+3]) + hb = (hb >> 8) | uint32(b[i+3])<<24 dst[i] = hash4u(hb, hashBits) } } @@ -358,7 +411,6 @@ func (d *compressor) initDeflate() { s.hashOffset = 1 s.length = minMatchLength - 1 s.offset = 0 - s.hash = 0 s.chainHead = -1 } @@ -374,11 +426,19 @@ func (d *compressor) deflateLazy() { if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { return } + if d.windowEnd != s.index && d.chain > 100 { + // Get literal huffman coder. + if d.h == nil { + d.h = newHuffmanEncoder(maxFlateBlockTokens) + } + var tmp [256]uint16 + for _, v := range d.window[s.index:d.windowEnd] { + tmp[v]++ + } + d.h.generate(tmp[:], 15) + } s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if s.index < s.maxInsertIndex { - s.hash = hash4(d.window[s.index : s.index+minMatchLength]) - } for { if sanity && s.index > d.windowEnd { @@ -410,11 +470,11 @@ func (d *compressor) deflateLazy() { } if s.index < s.maxInsertIndex { // Update the hash - s.hash = hash4(d.window[s.index : s.index+minMatchLength]) - ch := s.hashHead[s.hash&hashMask] + hash := hash4(d.window[s.index:]) + ch := s.hashHead[hash] s.chainHead = int(ch) s.hashPrev[s.index&windowMask] = ch - s.hashHead[s.hash&hashMask] = uint32(s.index + s.hashOffset) + s.hashHead[hash] = uint32(s.index + s.hashOffset) } prevLength := s.length prevOffset := s.offset @@ -426,12 +486,113 @@ func (d *compressor) deflateLazy() { } if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { - if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok { + if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok { s.length = newLength s.offset = newOffset } } + if prevLength >= minMatchLength && s.length <= prevLength { + // No better match, but check for better match at end... + // + // Skip forward a number of bytes. + // Offset of 2 seems to yield best results. 3 is sometimes better. + const checkOff = 2 + + // Check all, except full length + if prevLength < maxMatchLength-checkOff { + prevIndex := s.index - 1 + if prevIndex+prevLength < s.maxInsertIndex { + end := lookahead + if lookahead > maxMatchLength+checkOff { + end = maxMatchLength + checkOff + } + end += prevIndex + + // Hash at match end. + h := hash4(d.window[prevIndex+prevLength:]) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { + length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) + // It seems like a pure length metric is best. + if length > prevLength { + prevLength = length + prevOffset = prevIndex - ch2 + + // Extend back... 
+ for i := checkOff - 1; i >= 0; i-- { + if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] { + // Emit tokens we "owe" + for j := 0; j <= i; j++ { + d.tokens.AddLiteral(d.window[prevIndex+j]) + if d.tokens.n == maxFlateBlockTokens { + // The block includes the current character + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.index++ + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + } + break + } else { + prevLength++ + } + } + } else if false { + // Check one further ahead. + // Only rarely better, disabled for now. + prevIndex++ + h := hash4(d.window[prevIndex+prevLength:]) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { + length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) + // It seems like a pure length metric is best. + if length > prevLength+checkOff { + prevLength = length + prevOffset = prevIndex - ch2 + prevIndex-- + + // Extend back... + for i := checkOff; i >= 0; i-- { + if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i-1] { + // Emit tokens we "owe" + for j := 0; j <= i; j++ { + d.tokens.AddLiteral(d.window[prevIndex+j]) + if d.tokens.n == maxFlateBlockTokens { + // The block includes the current character + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.index++ + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + } + break + } else { + prevLength++ + } + } + } + } + } + } + } + } // There was a match at the previous step, and the current match is // not better. Output the previous match. d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)) @@ -440,8 +601,7 @@ func (d *compressor) deflateLazy() { // index and index-1 are already inserted. If there is not enough // lookahead, the last two strings are not inserted into the hash // table. - var newIndex int - newIndex = s.index + prevLength - 1 + newIndex := s.index + prevLength - 1 // Calculate missing hashes end := newIndex if end > s.maxInsertIndex { @@ -467,7 +627,6 @@ func (d *compressor) deflateLazy() { // Set the head of the hash chain to us. s.hashHead[newH] = uint32(di + s.hashOffset) } - s.hash = newH } s.index = newIndex @@ -480,6 +639,7 @@ func (d *compressor) deflateLazy() { } d.tokens.Reset() } + s.ii = 0 } else { // Reset, if we got a match this run. if s.length >= minMatchLength { @@ -499,13 +659,12 @@ func (d *compressor) deflateLazy() { // If we have a long run of no matches, skip additional bytes // Resets when s.ii overflows after 64KB. - if s.ii > 31 { - n := int(s.ii >> 5) + if n := int(s.ii) - d.chain; n > 0 { + n = 1 + int(n>>6) for j := 0; j < n; j++ { if s.index >= d.windowEnd-1 { break } - d.tokens.AddLiteral(d.window[s.index-1]) if d.tokens.n == maxFlateBlockTokens { if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { @@ -513,6 +672,14 @@ func (d *compressor) deflateLazy() { } d.tokens.Reset() } + // Index... 
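// [Editor's aside] The checkOff block above re-probes the hash chain at the
// *end* of the previous match and, on success, grows the new match backwards
// through the skipped gap, emitting any uncovered bytes as the "owed"
// literals. The extend-back step in isolation (win, pos, cand and checkOff
// are hypothetical inputs, not the patch's own helper):
func extendBackSketch(win []byte, pos, cand, checkOff, length int) (newLen, owed int) {
	i := checkOff - 1
	for i >= 0 && win[pos+i] == win[cand+i] {
		length++ // the earlier byte also matches; fold it into the match
		i--
	}
	// Bytes 0..i disagree and must be emitted as literals before the match.
	return length, i + 1
}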
+ if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } s.index++ } // Flush last byte @@ -612,7 +779,9 @@ func (d *compressor) write(b []byte) (n int, err error) { } n = len(b) for len(b) > 0 { - d.step(d) + if d.windowEnd == len(d.window) || d.sync { + d.step(d) + } b = b[d.fill(d, b):] if d.err != nil { return 0, d.err @@ -645,26 +814,32 @@ func (d *compressor) init(w io.Writer, level int) (err error) { d.fill = (*compressor).fillBlock d.step = (*compressor).store case level == ConstantCompression: - d.w.logNewTablePenalty = 4 - d.window = make([]byte, maxStoreBlockSize) + d.w.logNewTablePenalty = 10 + d.window = make([]byte, 32<<10) d.fill = (*compressor).fillBlock d.step = (*compressor).storeHuff case level == DefaultCompression: level = 5 fallthrough case level >= 1 && level <= 6: - d.w.logNewTablePenalty = 6 + d.w.logNewTablePenalty = 7 d.fast = newFastEnc(level) d.window = make([]byte, maxStoreBlockSize) d.fill = (*compressor).fillBlock d.step = (*compressor).storeFast case 7 <= level && level <= 9: - d.w.logNewTablePenalty = 10 + d.w.logNewTablePenalty = 8 d.state = &advancedState{} d.compressionLevel = levels[level] d.initDeflate() d.fill = (*compressor).fillDeflate d.step = (*compressor).deflateLazy + case -level >= MinCustomWindowSize && -level <= MaxCustomWindowSize: + d.w.logNewTablePenalty = 7 + d.fast = &fastEncL5Window{maxOffset: int32(-level), cur: maxStoreBlockSize} + d.window = make([]byte, maxStoreBlockSize) + d.fill = (*compressor).fillBlock + d.step = (*compressor).storeFast default: return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level) } @@ -686,7 +861,7 @@ func (d *compressor) reset(w io.Writer) { } switch d.compressionLevel.chain { case 0: - // level was NoCompression or ConstantCompresssion. + // level was NoCompression or ConstantCompression. d.windowEnd = 0 default: s := d.state @@ -703,7 +878,6 @@ func (d *compressor) reset(w io.Writer) { d.tokens.Reset() s.length = minMatchLength - 1 s.offset = 0 - s.hash = 0 s.ii = 0 s.maxInsertIndex = 0 } @@ -762,6 +936,28 @@ func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) { return zw, err } +// MinCustomWindowSize is the minimum window size that can be sent to NewWriterWindow. +const MinCustomWindowSize = 32 + +// MaxCustomWindowSize is the maximum custom window that can be sent to NewWriterWindow. +const MaxCustomWindowSize = windowSize + +// NewWriterWindow returns a new Writer compressing data with a custom window size. +// windowSize must be from MinCustomWindowSize to MaxCustomWindowSize. +func NewWriterWindow(w io.Writer, windowSize int) (*Writer, error) { + if windowSize < MinCustomWindowSize { + return nil, errors.New("flate: requested window size less than MinWindowSize") + } + if windowSize > MaxCustomWindowSize { + return nil, errors.New("flate: requested window size bigger than MaxCustomWindowSize") + } + var dw Writer + if err := dw.d.init(w, -windowSize); err != nil { + return nil, err + } + return &dw, nil +} + // A Writer takes data written to it and writes the compressed // form of that data to an underlying writer (see NewWriter). 
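// [Editor's aside] A minimal usage sketch for the NewWriterWindow API added
// above. The helper name and the 4 KiB size are illustrative only:
func compressSmallWindow(dst io.Writer, data []byte) error {
	// A window below the default 32 KiB bounds encoder memory use and the
	// reach of back-references, at some cost in compression ratio.
	w, err := NewWriterWindow(dst, 4<<10)
	if err != nil {
		return err
	}
	if _, err := w.Write(data); err != nil {
		return err
	}
	return w.Close()
}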
type Writer struct { diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go index 71c75a065e..bb36351a5a 100644 --- a/vendor/github.com/klauspost/compress/flate/dict_decoder.go +++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go @@ -7,19 +7,19 @@ package flate // dictDecoder implements the LZ77 sliding dictionary as used in decompression. // LZ77 decompresses data through sequences of two forms of commands: // -// * Literal insertions: Runs of one or more symbols are inserted into the data -// stream as is. This is accomplished through the writeByte method for a -// single symbol, or combinations of writeSlice/writeMark for multiple symbols. -// Any valid stream must start with a literal insertion if no preset dictionary -// is used. +// - Literal insertions: Runs of one or more symbols are inserted into the data +// stream as is. This is accomplished through the writeByte method for a +// single symbol, or combinations of writeSlice/writeMark for multiple symbols. +// Any valid stream must start with a literal insertion if no preset dictionary +// is used. // -// * Backward copies: Runs of one or more symbols are copied from previously -// emitted data. Backward copies come as the tuple (dist, length) where dist -// determines how far back in the stream to copy from and length determines how -// many bytes to copy. Note that it is valid for the length to be greater than -// the distance. Since LZ77 uses forward copies, that situation is used to -// perform a form of run-length encoding on repeated runs of symbols. -// The writeCopy and tryWriteCopy are used to implement this command. +// - Backward copies: Runs of one or more symbols are copied from previously +// emitted data. Backward copies come as the tuple (dist, length) where dist +// determines how far back in the stream to copy from and length determines how +// many bytes to copy. Note that it is valid for the length to be greater than +// the distance. Since LZ77 uses forward copies, that situation is used to +// perform a form of run-length encoding on repeated runs of symbols. +// The writeCopy and tryWriteCopy are used to implement this command. // // For performance reasons, this implementation performs little to no sanity // checks about the arguments. As such, the invariants documented for each diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go index 4a73e1bdd3..c8124b5c49 100644 --- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -6,8 +6,8 @@ package flate import ( + "encoding/binary" "fmt" - "math/bits" ) type fastEnc interface { @@ -44,7 +44,7 @@ const ( bTableBits = 17 // Bits used in the big tables bTableSize = 1 << bTableBits // Size of the table - allocHistory = maxStoreBlockSize * 10 // Size to preallocate for history. + allocHistory = maxStoreBlockSize * 5 // Size to preallocate for history. bufferReset = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this. ) @@ -57,38 +57,12 @@ const ( prime8bytes = 0xcf1bbcdcb7a56463 ) -func load32(b []byte, i int) uint32 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. 
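// [Editor's aside] The dictDecoder comment above notes that a backward copy
// may have length greater than distance; copying byte-by-byte then re-reads
// bytes the copy itself just produced, which is how LZ77 expresses
// run-length encoding. In miniature:
func overlapCopySketch(dst []byte, dist, length int) []byte {
	start := len(dst) - dist
	for i := 0; i < length; i++ {
		dst = append(dst, dst[start+i]) // may read a byte appended above
	}
	return dst
}
// overlapCopySketch([]byte("ab"), 1, 4) returns "abbbbb": the trailing 'b'
// is replicated four more times.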
- b = b[i:] - b = b[:4] - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load64(b []byte, i int) uint64 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:8] - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - func load3232(b []byte, i int32) uint32 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:4] - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + return binary.LittleEndian.Uint32(b[i:]) } func load6432(b []byte, i int32) uint64 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:8] - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift + return binary.LittleEndian.Uint64(b[i:]) } type tableEntry struct { @@ -114,7 +88,8 @@ func (e *fastGen) addBlock(src []byte) int32 { } // Move down offset := int32(len(e.hist)) - maxMatchOffset - copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + // copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + *(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:]) e.cur += offset e.hist = e.hist[:maxMatchOffset] } @@ -124,39 +99,36 @@ func (e *fastGen) addBlock(src []byte) int32 { return s } -// hash4 returns the hash of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. -func hash4u(u uint32, h uint8) uint32 { - return (u * prime4bytes) >> ((32 - h) & reg8SizeMask32) -} - type tableEntryPrev struct { Cur tableEntry Prev tableEntry } -// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. -func hash4x64(u uint64, h uint8) uint32 { - return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32) -} - // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. // Preferably h should be a constant and should always be <64. func hash7(u uint64, h uint8) uint32 { return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64)) } -// hash8 returns the hash of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash8(u uint64, h uint8) uint32 { - return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64)) -} - -// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash6(u uint64, h uint8) uint32 { - return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64)) +// hashLen returns a hash of the lowest mls bytes of with length output bits. +// mls must be >=3 and <=8. Any other value will return hash for 4 bytes. +// length should always be < 32. +// Preferably length and mls should be a constant for inlining. 
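// [Editor's aside] addBlock above swaps copy() for a Go 1.17+
// slice-to-array-pointer conversion; because the element count is a
// constant, the compiler can emit one fixed-size move instead of a copy
// loop. The shape of the idiom, with a tiny stand-in size:
func moveTailToFront(hist []byte) {
	const winSize = 4 // stand-in for maxMatchOffset
	// The conversions panic if either slice is shorter than winSize, just
	// as the real code relies on its own length invariants.
	off := len(hist) - winSize
	*(*[winSize]byte)(hist) = *(*[winSize]byte)(hist[off:])
}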
+func hashLen(u uint64, length, mls uint8) uint32 { + switch mls { + case 3: + return (uint32(u<<8) * prime3bytes) >> (32 - length) + case 5: + return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length)) + case 6: + return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length)) + case 7: + return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length)) + case 8: + return uint32((u * prime8bytes) >> (64 - length)) + default: + return (uint32(u) * prime4bytes) >> (32 - length) + } } // matchlen will return the match length between offsets and t in src. @@ -189,7 +161,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 { // matchlenLong will return the match length between offsets and t in src. // It is assumed that s > t, that t >=0 and s < len(src). func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { - if debugDecode { + if debugDeflate { if t >= s { panic(fmt.Sprint("t >=s:", t, s)) } @@ -219,36 +191,3 @@ func (e *fastGen) Reset() { } e.hist = e.hist[:0] } - -// matchLen returns the maximum length. -// 'a' must be the shortest of the two. -func matchLen(a, b []byte) int { - b = b[:len(a)] - var checked int - if len(a) > 4 { - // Try 4 bytes first - if diff := load32(a, 0) ^ load32(b, 0); diff != 0 { - return bits.TrailingZeros32(diff) >> 3 - } - // Switch to 8 byte matching. - checked = 4 - a = a[4:] - b = b[4:] - for len(a) >= 8 { - b = b[:len(a)] - if diff := load64(a, 0) ^ load64(b, 0); diff != 0 { - return checked + (bits.TrailingZeros64(diff) >> 3) - } - checked += 8 - a = a[8:] - b = b[8:] - } - } - b = b[:len(a)] - for i := range a { - if a[i] != b[i] { - return int(i) + checked - } - } - return len(a) + checked -} diff --git a/vendor/github.com/klauspost/compress/flate/gen_inflate.go b/vendor/github.com/klauspost/compress/flate/gen_inflate.go deleted file mode 100644 index 35fc072a3e..0000000000 --- a/vendor/github.com/klauspost/compress/flate/gen_inflate.go +++ /dev/null @@ -1,294 +0,0 @@ -// +build generate - -//go:generate go run $GOFILE && gofmt -w inflate_gen.go - -package main - -import ( - "os" - "strings" -) - -func main() { - f, err := os.Create("inflate_gen.go") - if err != nil { - panic(err) - } - defer f.Close() - types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"} - names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"} - imports := []string{"bytes", "bufio", "io", "strings", "math/bits"} - f.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT. - -package flate - -import ( -`) - - for _, imp := range imports { - f.WriteString("\t\"" + imp + "\"\n") - } - f.WriteString(")\n\n") - - template := ` - -// Decode a single Huffman block from f. -// hl and hd are the Huffman states for the lit/length values -// and the distance values, respectively. If hd == nil, using the -// fixed distance encoding associated with fixed Huffman blocks. -func (f *decompressor) $FUNCNAME$() { - const ( - stateInit = iota // Zero value must be stateInit - stateDict - ) - fr := f.r.($TYPE$) - - switch f.stepState { - case stateInit: - goto readLiteral - case stateDict: - goto copyHistory - } - -readLiteral: - // Read literal and/or (length, distance) according to RFC section 3.2.3. - { - var v int - { - // Inlined v, err := f.huffSym(f.hl) - // Since a huffmanDecoder can be empty or be composed of a degenerate tree - // with single element, huffSym must error on these two edge cases. 
In both
-			// cases, the chunks slice will be 0 for the invalid sequence, leading it
-			// satisfy the n == 0 check below.
-			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
-			for {
-				for nb < n {
-					c, err := fr.ReadByte()
-					if err != nil {
-						f.b = b
-						f.nb = nb
-						f.err = noEOF(err)
-						return
-					}
-					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
-				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
-				n = uint(chunk & huffmanCountMask)
-				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
-					n = uint(chunk & huffmanCountMask)
-				}
-				if n <= nb {
-					if n == 0 {
-						f.b = b
-						f.nb = nb
-						if debugDecode {
-							fmt.Println("huffsym: n==0")
-						}
-						f.err = CorruptInputError(f.roffset)
-						return
-					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
-					v = int(chunk >> huffmanValueShift)
-					break
-				}
-			}
-		}
-
-		var length int
-		switch {
-		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).$FUNCNAME$
-				f.stepState = stateInit
-				return
-			}
-			goto readLiteral
-		case v == 256:
-			f.finishBlock()
-			return
-		// otherwise, reference to older data
-		case v < 265:
-			length = v - (257 - 3)
-		case v < maxNumLit:
-			val := decCodeToLen[(v - 257)]
-			length = int(val.length) + 3
-			n := uint(val.extra)
-			for f.nb < n {
-				c, err := fr.ReadByte()
-				if err != nil {
-					if debugDecode {
-						fmt.Println("morebits n>0:", err)
-					}
-					f.err = err
-					return
-				}
-				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
-			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
-		default:
-			if debugDecode {
-				fmt.Println(v, ">= maxNumLit")
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		var dist uint32
-		if f.hd == nil {
-			for f.nb < 5 {
-				c, err := fr.ReadByte()
-				if err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<5:", err)
-					}
-					f.err = err
-					return
-				}
-				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
-			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
-		} else {
-			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
-			// with single element, huffSym must error on these two edge cases. In both
-			// cases, the chunks slice will be 0 for the invalid sequence, leading it
-			// satisfy the n == 0 check below.
-			n := uint(f.hd.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
-			for {
-				for nb < n {
-					c, err := fr.ReadByte()
-					if err != nil {
-						f.b = b
-						f.nb = nb
-						f.err = noEOF(err)
-						return
-					}
-					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
-				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
-				n = uint(chunk & huffmanCountMask)
-				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
-					n = uint(chunk & huffmanCountMask)
-				}
-				if n <= nb {
-					if n == 0 {
-						f.b = b
-						f.nb = nb
-						if debugDecode {
-							fmt.Println("huffsym: n==0")
-						}
-						f.err = CorruptInputError(f.roffset)
-						return
-					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
-					dist = uint32(chunk >> huffmanValueShift)
-					break
-				}
-			}
-		}
-
-		switch {
-		case dist < 4:
-			dist++
-		case dist < maxNumDist:
-			nb := uint(dist-2) >> 1
-			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
-				c, err := fr.ReadByte()
-				if err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<nb:", err)
-					}
-					f.err = err
-					return
-				}
-				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
-			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
-			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
-		default:
-			if debugDecode {
-				fmt.Println("dist too big:", dist, maxNumDist)
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
-			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		f.copyLen, f.copyDist = length, int(dist)
-		goto copyHistory
-	}
-
-copyHistory:
-	// Perform a backwards copy according to RFC section 3.2.3.
-	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
-		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
-		}
-		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).$FUNCNAME$ // We need to continue this work
-			f.stepState = stateDict
-			return
-		}
-		goto readLiteral
-	}
-}
-
-`
-	for i, t := range types {
-		s := strings.Replace(template, "$FUNCNAME$", "huffman"+names[i], -1)
-		s = strings.Replace(s, "$TYPE$", t, -1)
-		f.WriteString(s)
-	}
-	f.WriteString("func (f *decompressor) huffmanBlockDecoder() func() {\n")
-	f.WriteString("\tswitch f.r.(type) {\n")
-	for i, t := range types {
-		f.WriteString("\t\tcase " + t + ":\n")
-		f.WriteString("\t\t\treturn f.huffman" + names[i] + "\n")
-	}
-	f.WriteString("\t\tdefault:\n")
-	f.WriteString("\t\t\treturn f.huffmanBlockGeneric")
-	f.WriteString("\t}\n}\n")
-}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
index 208d66711d..f70594c34e 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -5,7 +5,10 @@
 package flate
 
 import (
+	"encoding/binary"
+	"fmt"
 	"io"
+	"math"
 )
 
 const (
@@ -22,20 +25,22 @@ const (
 	codegenCodeCount = 19
 	badCode          = 255
 
+	// maxPredefinedTokens is the maximum number of tokens
+	// where we check if fixed size is smaller.
+	maxPredefinedTokens = 250
+
 	// bufferFlushSize indicates the buffer size
 	// after which bytes are flushed to the writer.
 	// Should preferably be a multiple of 6, since
 	// we accumulate 6 bytes between writes to the buffer.
-	bufferFlushSize = 240
-
-	// bufferSize is the actual output byte buffer size.
-	// It must have additional headroom for a flush
-	// which can contain up to 8 bytes.
- bufferSize = bufferFlushSize + 8 + bufferFlushSize = 246 ) +// Minimum length code that emits bits. +const lengthExtraBitsMinCode = 8 + // The number of extra bits needed by length code X - LENGTH_CODES_START. -var lengthExtraBits = [32]int8{ +var lengthExtraBits = [32]uint8{ /* 257 */ 0, 0, 0, /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, @@ -49,28 +54,41 @@ var lengthBase = [32]uint8{ 64, 80, 96, 112, 128, 160, 192, 224, 255, } +// Minimum offset code that emits bits. +const offsetExtraBitsMinCode = 4 + // offset code word extra bits. -var offsetExtraBits = [64]int8{ +var offsetExtraBits = [32]int8{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, /* extended window */ - 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, + 14, 14, } -var offsetBase = [64]uint32{ - /* normal deflate */ - 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, - 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, - 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, - 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, - 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, - 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, +var offsetCombined = [32]uint32{} - /* extended window */ - 0x008000, 0x00c000, 0x010000, 0x018000, 0x020000, - 0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000, - 0x100000, 0x180000, 0x200000, 0x300000, +func init() { + var offsetBase = [32]uint32{ + /* normal deflate */ + 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, + 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, + 0x000020, 0x000030, 0x000040, 0x000060, 0x000080, + 0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300, + 0x000400, 0x000600, 0x000800, 0x000c00, 0x001000, + 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, + + /* extended window */ + 0x008000, 0x00c000, + } + + for i := range offsetCombined[:] { + // Don't use extended window values... + if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 { + continue + } + offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8) + } } // The odd order in which the codegen code sizes are written. @@ -85,17 +103,18 @@ type huffmanBitWriter struct { // Data waiting to be written is bytes[0:nbytes] // and then the low nbits of bits. 
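// [Editor's aside] The offsetCombined table initialized above packs each
// offset code's extra-bit count into the low byte and its base value into
// the upper 24 bits, so the hot emit loop fetches both with one table load:
func unpackOffsetSketch(comb uint32) (extraBits uint8, base uint32) {
	return uint8(comb), comb >> 8
}
// writeTokens (further down) then emits offset-base as the extra-bit
// payload and advances the pending bit count by extraBits.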
bits uint64 - nbits uint16 + nbits uint8 nbytes uint8 + lastHuffMan bool literalEncoding *huffmanEncoder + tmpLitEncoding *huffmanEncoder offsetEncoding *huffmanEncoder codegenEncoding *huffmanEncoder err error lastHeader int // Set between 0 (reused block can be up to 2x the size) logNewTablePenalty uint - lastHuffMan bool - bytes [256]byte + bytes [256 + 8]byte literalFreq [lengthCodesStart + 32]uint16 offsetFreq [32]uint16 codegenFreq [codegenCodeCount]uint16 @@ -127,6 +146,7 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { return &huffmanBitWriter{ writer: w, literalEncoding: newHuffmanEncoder(literalCount), + tmpLitEncoding: newHuffmanEncoder(literalCount), codegenEncoding: newHuffmanEncoder(codegenCodeCount), offsetEncoding: newHuffmanEncoder(offsetCodeCount), } @@ -139,37 +159,33 @@ func (w *huffmanBitWriter) reset(writer io.Writer) { w.lastHuffMan = false } -func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) { - offsets, lits = true, true +func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { a := t.offHist[:offsetCodeCount] - b := w.offsetFreq[:len(a)] - for i := range a { - if b[i] == 0 && a[i] != 0 { - offsets = false - break + b := w.offsetEncoding.codes + b = b[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false } } a = t.extraHist[:literalCount-256] - b = w.literalFreq[256:literalCount] + b = w.literalEncoding.codes[256:literalCount] b = b[:len(a)] - for i := range a { - if b[i] == 0 && a[i] != 0 { - lits = false - break + for i, v := range a { + if v != 0 && b[i].zero() { + return false } } - if lits { - a = t.litHist[:] - b = w.literalFreq[:len(a)] - for i := range a { - if b[i] == 0 && a[i] != 0 { - lits = false - break - } + + a = t.litHist[:256] + b = w.literalEncoding.codes[:len(a)] + for i, v := range a { + if v != 0 && b[i].zero() { + return false } } - return + return true } func (w *huffmanBitWriter) flush() { @@ -205,8 +221,8 @@ func (w *huffmanBitWriter) write(b []byte) { _, w.err = w.writer.Write(b) } -func (w *huffmanBitWriter) writeBits(b int32, nb uint16) { - w.bits |= uint64(b) << (w.nbits & reg16SizeMask64) +func (w *huffmanBitWriter) writeBits(b int32, nb uint8) { + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { w.writeOutBits() @@ -244,9 +260,9 @@ func (w *huffmanBitWriter) writeBytes(bytes []byte) { // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional // information. Code badCode is an end marker // -// numLiterals The number of literals in literalEncoding -// numOffsets The number of offsets in offsetEncoding -// litenc, offenc The literal and offset encoder to use +// numLiterals The number of literals in literalEncoding +// numOffsets The number of offsets in offsetEncoding +// litenc, offenc The literal and offset encoder to use func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) { for i := range w.codegenFreq { w.codegenFreq[i] = 0 @@ -259,12 +275,12 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE // Copy the concatenated code sizes to codegen. Put a marker at the end. 
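// [Editor's aside] writeBits above shows the accumulator pattern this file
// now uses everywhere: OR the code into a uint64 at the pending bit count,
// and once 48 bits are queued flush six whole bytes, leaving fewer than 16
// pending bits so the next code still fits without overflow. Self-contained
// form of one flush step (encoding/binary is imported by this file):
func flush48Sketch(bits uint64, nbits uint8, out []byte) (uint64, uint8, []byte) {
	if nbits >= 48 {
		var tmp [8]byte
		binary.LittleEndian.PutUint64(tmp[:], bits)
		out = append(out, tmp[:6]...) // the low 48 bits, LSB first
		bits >>= 48
		nbits -= 48
	}
	return bits, nbits, out
}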
cgnl := codegen[:numLiterals] for i := range cgnl { - cgnl[i] = uint8(litEnc.codes[i].len) + cgnl[i] = litEnc.codes[i].len() } cgnl = codegen[numLiterals : numLiterals+numOffsets] for i := range cgnl { - cgnl[i] = uint8(offEnc.codes[i].len) + cgnl[i] = offEnc.codes[i].len() } codegen[numLiterals+numOffsets] = badCode @@ -407,8 +423,8 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { func (w *huffmanBitWriter) writeCode(c hcode) { // The function does not get inlined if we "& 63" the shift. - w.bits |= uint64(c.code) << w.nbits - w.nbits += c.len + w.bits |= c.code64() << (w.nbits & 63) + w.nbits += c.len() if w.nbits >= 48 { w.writeOutBits() } @@ -420,13 +436,11 @@ func (w *huffmanBitWriter) writeOutBits() { w.bits >>= 48 w.nbits -= 48 n := w.nbytes - w.bytes[n] = byte(bits) - w.bytes[n+1] = byte(bits >> 8) - w.bytes[n+2] = byte(bits >> 16) - w.bytes[n+3] = byte(bits >> 24) - w.bytes[n+4] = byte(bits >> 32) - w.bytes[n+5] = byte(bits >> 40) + + // We over-write, but faster... + binary.LittleEndian.PutUint64(w.bytes[n:], bits) n += 6 + if n >= bufferFlushSize { if w.err != nil { n = 0 @@ -435,14 +449,15 @@ func (w *huffmanBitWriter) writeOutBits() { w.write(w.bytes[:n]) n = 0 } + w.nbytes = n } // Write the header of a dynamic Huffman block to the output stream. // -// numLiterals The number of literals specified in codegen -// numOffsets The number of offsets specified in codegen -// numCodegens The number of codegens used in codegen +// numLiterals The number of literals specified in codegen +// numOffsets The number of offsets specified in codegen +// numCodegens The number of codegens used in codegen func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) { if w.err != nil { return @@ -457,7 +472,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n w.writeBits(int32(numCodegens-4), 4) for i := 0; i < numCodegens; i++ { - value := uint(w.codegenEncoding.codes[codegenOrder[i]].len) + value := uint(w.codegenEncoding.codes[codegenOrder[i]].len()) w.writeBits(int32(value), 3) } @@ -551,7 +566,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { w.lastHeader = 0 } numLiterals, numOffsets := w.indexTokens(tokens, false) - w.generate(tokens) + w.generate() var extraBits int storedSize, storable := w.storedSize(input) if storable { @@ -562,7 +577,10 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { // Fixed Huffman baseline. var literalEncoding = fixedLiteralEncoding var offsetEncoding = fixedOffsetEncoding - var size = w.fixedSize(extraBits) + var size = math.MaxInt32 + if tokens.n < maxPredefinedTokens { + size = w.fixedSize(extraBits) + } // Dynamic Huffman? var numCodegens int @@ -580,7 +598,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { } // Stored bytes? - if storable && storedSize < size { + if storable && storedSize <= size { w.writeStoredHeader(len(input), eof) w.writeBytes(input) return @@ -619,22 +637,39 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b w.lastHeader = 0 w.lastHuffMan = false } - if !sync { - tokens.Fill() + + // fillReuse enables filling of empty values. + // This will make encodings always reusable without testing. + // However, this does not appear to benefit on most cases. + const fillReuse = false + + // Check if we can reuse... 
+ if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) { + w.writeCode(w.literalEncoding.codes[endBlockMarker]) + w.lastHeader = 0 } + numLiterals, numOffsets := w.indexTokens(tokens, !sync) + extraBits := 0 + ssize, storable := w.storedSize(input) + + const usePrefs = true + if storable || w.lastHeader > 0 { + extraBits = w.extraBitSize() + } var size int + // Check if we should reuse. if w.lastHeader > 0 { // Estimate size for using a new table. // Use the previous header size as the best estimate. newSize := w.lastHeader + tokens.EstimatedBits() - newSize += newSize >> w.logNewTablePenalty + newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty // The estimated size is calculated as an optimal table. // We add a penalty to make it more realistic and re-use a bit more. - reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize() + reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits // Check if a new table is better. if newSize < reuseSize { @@ -645,35 +680,83 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b } else { size = reuseSize } + + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size { + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } // Check if we get a reasonable size decrease. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + if storable && ssize <= size { w.writeStoredHeader(len(input), eof) w.writeBytes(input) - w.lastHeader = 0 return } } // We want a new block/table if w.lastHeader == 0 { - w.generate(tokens) + if fillReuse && !sync { + w.fillTokens() + numLiterals, numOffsets = maxNumLit, maxNumDist + } else { + w.literalFreq[endBlockMarker] = 1 + } + + w.generate() // Generate codegen and codegenFrequencies, which indicates how to encode // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) w.codegenEncoding.generate(w.codegenFreq[:], 7) + var numCodegens int - size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize()) - // Store bytes, if we don't get a reasonable improvement. - if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + if fillReuse && !sync { + // Reindex for accurate size... + w.indexTokens(tokens, true) + } + size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) + + // Store predefined, if we don't get a reasonable improvement. + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size { + // Store bytes, if we don't get an improvement. + if storable && ssize <= preSize { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) + return + } + } + + if storable && ssize <= size { + // Store bytes, if we don't get an improvement. w.writeStoredHeader(len(input), eof) w.writeBytes(input) - w.lastHeader = 0 return } // Write Huffman table. 
w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - w.lastHeader, _ = w.headerSize() + if !sync { + w.lastHeader, _ = w.headerSize() + } w.lastHuffMan = false } @@ -684,14 +767,29 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes) } +func (w *huffmanBitWriter) fillTokens() { + for i, v := range w.literalFreq[:literalCount] { + if v == 0 { + w.literalFreq[i] = 1 + } + } + for i, v := range w.offsetFreq[:offsetCodeCount] { + if v == 0 { + w.offsetFreq[i] = 1 + } + } +} + // indexTokens indexes a slice of tokens, and updates // literalFreq and offsetFreq, and generates literalEncoding // and offsetEncoding. // The number of literal and offset tokens is returned. func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) { - copy(w.literalFreq[:], t.litHist[:]) - copy(w.literalFreq[256:], t.extraHist[:]) - copy(w.offsetFreq[:], t.offHist[:offsetCodeCount]) + //copy(w.literalFreq[:], t.litHist[:]) + *(*[256]uint16)(w.literalFreq[:]) = t.litHist + //copy(w.literalFreq[256:], t.extraHist[:]) + *(*[32]uint16)(w.literalFreq[256:]) = t.extraHist + w.offsetFreq = t.offHist if t.n == 0 { return @@ -718,7 +816,7 @@ func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, num return } -func (w *huffmanBitWriter) generate(t *tokens) { +func (w *huffmanBitWriter) generate() { w.literalEncoding.generate(w.literalFreq[:literalCount], 15) w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) } @@ -745,52 +843,135 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) offs := oeCodes[:32] lengths := leCodes[lengthCodesStart:] lengths = lengths[:32] + + // Go 1.16 LOVES having these on stack. 
+ bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + for _, t := range tokens { - if t < matchType { - w.writeCode(lits[t.literal()]) + if t < 256 { + //w.writeCode(lits[t.literal()]) + c := lits[t] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } continue } // Write the length length := t.length() - lengthCode := lengthCode(length) + lengthCode := lengthCode(length) & 31 if false { - w.writeCode(lengths[lengthCode&31]) + w.writeCode(lengths[lengthCode]) } else { // inlined - c := lengths[lengthCode&31] - w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64) - w.nbits += c.len - if w.nbits >= 48 { - w.writeOutBits() + c := lengths[lengthCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } } } - extraLengthBits := uint16(lengthExtraBits[lengthCode&31]) - if extraLengthBits > 0 { - extraLength := int32(length - lengthBase[lengthCode&31]) - w.writeBits(extraLength, extraLengthBits) + if lengthCode >= lengthExtraBitsMinCode { + extraLengthBits := lengthExtraBits[lengthCode] + //w.writeBits(extraLength, extraLengthBits) + extraLength := int32(length - lengthBase[lengthCode]) + bits |= uint64(extraLength) << (nbits & 63) + nbits += extraLengthBits + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } // Write the offset offset := t.offset() - offsetCode := offsetCode(offset) + offsetCode := (offset >> 16) & 31 if false { - w.writeCode(offs[offsetCode&31]) + w.writeCode(offs[offsetCode]) } else { // inlined - c := offs[offsetCode&31] - w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64) - w.nbits += c.len - if w.nbits >= 48 { - w.writeOutBits() + c := offs[offsetCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } } } - extraOffsetBits := uint16(offsetExtraBits[offsetCode&63]) - if extraOffsetBits > 0 { - extraOffset := int32(offset - offsetBase[offsetCode&63]) - w.writeBits(extraOffset, extraOffsetBits) + + if offsetCode >= offsetExtraBitsMinCode { + offsetComb := offsetCombined[offsetCode] + //w.writeBits(extraOffset, extraOffsetBits) + bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) + nbits += uint8(offsetComb) + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes 
= 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + } } } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + if deferEOB { w.writeCode(leCodes[endBlockMarker]) } @@ -825,43 +1006,78 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { } } + const numLiterals = endBlockMarker + 1 + const numOffsets = 1 + // Add everything as literals // We have to estimate the header size. // Assume header is around 70 bytes: // https://stackoverflow.com/a/25454430 const guessHeaderSizeBits = 70 * 8 - estBits, estExtra := histogramSize(input, w.literalFreq[:], !eof && !sync) - estBits += w.lastHeader + 15 - if w.lastHeader == 0 { - estBits += guessHeaderSizeBits + histogram(input, w.literalFreq[:numLiterals]) + ssize, storable := w.storedSize(input) + if storable && len(input) > 1024 { + // Quick check for incompressible content. + abs := float64(0) + avg := float64(len(input)) / 256 + max := float64(len(input) * 2) + for _, v := range w.literalFreq[:256] { + diff := float64(v) - avg + abs += diff * diff + if abs > max { + break + } + } + if abs < max { + if debugDeflate { + fmt.Println("stored", abs, "<", max) + } + // No chance we can compress this... + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + } + w.literalFreq[endBlockMarker] = 1 + w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) + estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals]) + if estBits < math.MaxInt32 { + estBits += w.lastHeader + if w.lastHeader == 0 { + estBits += guessHeaderSizeBits + } + estBits += estBits >> w.logNewTablePenalty } - estBits += estBits >> w.logNewTablePenalty // Store bytes, if we don't get a reasonable improvement. - ssize, storable := w.storedSize(input) - if storable && ssize < estBits { + if storable && ssize <= estBits { + if debugDeflate { + fmt.Println("stored,", ssize, "<=", estBits) + } w.writeStoredHeader(len(input), eof) w.writeBytes(input) return } if w.lastHeader > 0 { - reuseSize := w.literalEncoding.bitLength(w.literalFreq[:256]) - estBits += estExtra + reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256]) if estBits < reuseSize { + if debugDeflate { + fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes") + } // We owe an EOB w.writeCode(w.literalEncoding.codes[endBlockMarker]) w.lastHeader = 0 + } else if debugDeflate { + fmt.Println("reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8) } } - const numLiterals = endBlockMarker + 1 - const numOffsets = 1 + count := 0 if w.lastHeader == 0 { - w.literalFreq[endBlockMarker] = 1 - w.literalEncoding.generate(w.literalFreq[:numLiterals], 15) - + // Use the temp encoding, so swap. + w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding // Generate codegen and codegenFrequencies, which indicates how to encode // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) @@ -872,39 +1088,94 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) w.lastHuffMan = true w.lastHeader, _ = w.headerSize() + if debugDeflate { + count += w.lastHeader + fmt.Println("header:", count/8) + } + } + + encoding := w.literalEncoding.codes[:256] + // Go 1.16 LOVES having these on stack. At least 1.5x the speed. 
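// [Editor's aside] The stored-block shortcut added to writeBlockHuff above
// is a variance test on the byte histogram: if the sum of squared
// deviations from a uniform average stays under 2x the input length, the
// data is close to random and entropy coding cannot win, so the block is
// emitted stored. Standalone form with the same thresholds:
func looksRandomSketch(hist *[256]uint16, n int) bool {
	avg := float64(n) / 256
	limit := float64(n * 2)
	sum := float64(0)
	for _, v := range hist {
		d := float64(v) - avg
		sum += d * d
		if sum > limit {
			return false // a skewed histogram is worth Huffman coding
		}
	}
	return sum < limit
}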
+ bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + if debugDeflate { + count -= int(nbytes)*8 + int(nbits) + } + // Unroll, write 3 codes/loop. + // Fastest number of unrolls. + for len(input) > 3 { + // We must have at least 48 bits free. + if nbits >= 8 { + n := nbits >> 3 + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + bits >>= (n * 8) & 63 + nbits -= n * 8 + nbytes += n + } + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + if debugDeflate { + count += int(nbytes) * 8 + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + a, b := encoding[input[0]], encoding[input[1]] + bits |= a.code64() << (nbits & 63) + bits |= b.code64() << ((nbits + a.len()) & 63) + c := encoding[input[2]] + nbits += b.len() + a.len() + bits |= c.code64() << (nbits & 63) + nbits += c.len() + input = input[3:] } - encoding := w.literalEncoding.codes[:257] + // Remaining... for _, t := range input { - // Bitwriting inlined, ~30% speedup - c := encoding[t] - w.bits |= uint64(c.code) << ((w.nbits) & reg16SizeMask64) - w.nbits += c.len - if w.nbits >= 48 { - bits := w.bits - w.bits >>= 48 - w.nbits -= 48 - n := w.nbytes - w.bytes[n] = byte(bits) - w.bytes[n+1] = byte(bits >> 8) - w.bytes[n+2] = byte(bits >> 16) - w.bytes[n+3] = byte(bits >> 24) - w.bytes[n+4] = byte(bits >> 32) - w.bytes[n+5] = byte(bits >> 40) - n += 6 - if n >= bufferFlushSize { + if nbits >= 48 { + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits + bits >>= 48 + nbits -= 48 + nbytes += 6 + if nbytes >= bufferFlushSize { if w.err != nil { - n = 0 + nbytes = 0 return } - w.write(w.bytes[:n]) - n = 0 + if debugDeflate { + count += int(nbytes) * 8 + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 } - w.nbytes = n + } + // Bitwriting inlined, ~30% speedup + c := encoding[t] + bits |= c.code64() << (nbits & 63) + + nbits += c.len() + if debugDeflate { + count += int(c.len()) } } + // Restore... + w.bits, w.nbits, w.nbytes = bits, nbits, nbytes + + if debugDeflate { + nb := count + int(nbytes)*8 + int(nbits) + fmt.Println("wrote", nb, "bits,", nb/8, "bytes.") + } + // Flush if needed to have space. + if w.nbits >= 48 { + w.writeOutBits() + } + if eof || sync { - w.writeCode(encoding[endBlockMarker]) + w.writeCode(w.literalEncoding.codes[endBlockMarker]) w.lastHeader = 0 w.lastHuffMan = false } diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go index 4c39a30187..be7b58b473 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -16,14 +16,28 @@ const ( ) // hcode is a huffman code with a bit code and bit length. -type hcode struct { - code, len uint16 +type hcode uint32 + +func (h hcode) len() uint8 { + return uint8(h) +} + +func (h hcode) code64() uint64 { + return uint64(h >> 8) +} + +func (h hcode) zero() bool { + return h == 0 } type huffmanEncoder struct { - codes []hcode - freqcache []literalNode - bitCount [17]int32 + codes []hcode + bitCount [17]int32 + + // Allocate a reusable buffer with the longest possible frequency table. + // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount. + // The largest of these is literalCount, so we allocate for that case. 
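// [Editor's aside] hcode above is now one uint32 with the bit length in the
// low byte and the code in the upper 24 bits, so the writer unpacks both
// without a second field load; the zero value doubles as "symbol has no
// code", which is what canReuse tests. For example (newhcode is defined
// just below):
//
//	c := newhcode(0x1ab, 9) // code 0x1ab, 9 bits
//	c.len()                 // 9
//	c.code64()              // 0x1ab
//	c.zero()                // false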
+ freqcache [literalCount + 1]literalNode } type literalNode struct { @@ -52,9 +66,12 @@ type levelInfo struct { } // set sets the code and length of an hcode. -func (h *hcode) set(code uint16, length uint16) { - h.len = length - h.code = code +func (h *hcode) set(code uint16, length uint8) { + *h = hcode(length) | (hcode(code) << 8) +} + +func newhcode(code uint16, length uint8) hcode { + return hcode(length) | (hcode(code) << 8) } func reverseBits(number uint16, bitLength byte) uint16 { @@ -76,7 +93,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder { var ch uint16 for ch = 0; ch < literalCount; ch++ { var bits uint16 - var size uint16 + var size uint8 switch { case ch < 144: // size 8, 000110000 .. 10111111 @@ -95,7 +112,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder { bits = ch + 192 - 280 size = 8 } - codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size} + codes[ch] = newhcode(reverseBits(bits, size), size) } return h } @@ -104,7 +121,7 @@ func generateFixedOffsetEncoding() *huffmanEncoder { h := newHuffmanEncoder(30) codes := h.codes for ch := range codes { - codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5} + codes[ch] = newhcode(reverseBits(uint16(ch), 5), 5) } return h } @@ -116,7 +133,30 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int { var total int for i, f := range freq { if f != 0 { - total += int(f) * int(h.codes[i].len) + total += int(f) * int(h.codes[i].len()) + } + } + return total +} + +func (h *huffmanEncoder) bitLengthRaw(b []byte) int { + var total int + for _, f := range b { + total += int(h.codes[f].len()) + } + return total +} + +// canReuseBits returns the number of bits or math.MaxInt32 if the encoder cannot be reused. +func (h *huffmanEncoder) canReuseBits(freq []uint16) int { + var total int + for i, f := range freq { + if f != 0 { + code := h.codes[i] + if code.zero() { + return math.MaxInt32 + } + total += int(f) * int(code.len()) } } return total @@ -128,13 +168,18 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int { // The cases of 0, 1, and 2 literals are handled by special case code. // // list An array of the literals with non-zero frequencies -// and their associated frequencies. The array is in order of increasing -// frequency, and has as its last element a special element with frequency -// MaxInt32 +// +// and their associated frequencies. The array is in order of increasing +// frequency, and has as its last element a special element with frequency +// MaxInt32 +// // maxBits The maximum number of bits that should be used to encode any literal. -// Must be less than 16. +// +// Must be less than 16. +// // return An integer array in which array[i] indicates the number of literals -// that should be encoded in i bits. +// +// that should be encoded in i bits. func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { if maxBits >= maxBitsLimit { panic("flate: maxBits too large") @@ -160,14 +205,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // of the level j ancestor. var leafCounts [maxBitsLimit][maxBitsLimit]int32 + // Descending to only have 1 bounds check. + l2f := int32(list[2].freq) + l1f := int32(list[1].freq) + l0f := int32(list[0].freq) + int32(list[1].freq) + for level := int32(1); level <= maxBits; level++ { // For every level, the first two items are the first two characters. // We initialize the levels as if we had already figured this out. 
levels[level] = levelInfo{ level: level, - lastFreq: int32(list[1].freq), - nextCharFreq: int32(list[2].freq), - nextPairFreq: int32(list[0].freq) + int32(list[1].freq), + lastFreq: l1f, + nextCharFreq: l2f, + nextPairFreq: l0f, } leafCounts[level][level] = 2 if level == 1 { @@ -178,8 +228,8 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // We need a total of 2*n - 2 items at top level and have already generated 2. levels[maxBits].needed = 2*n - 4 - level := maxBits - for { + level := uint32(maxBits) + for level < 16 { l := &levels[level] if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 { // We've run out of both leafs and pairs. @@ -211,7 +261,13 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // more values in the level below l.lastFreq = l.nextPairFreq // Take leaf counts from the lower level, except counts[level] remains the same. - copy(leafCounts[level][:level], leafCounts[level-1][:level]) + if true { + save := leafCounts[level][level] + leafCounts[level] = leafCounts[level-1] + leafCounts[level][level] = save + } else { + copy(leafCounts[level][:level], leafCounts[level-1][:level]) + } levels[l.level-1].needed = 2 } @@ -269,7 +325,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN sortByLiteral(chunk) for _, node := range chunk { - h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)} + h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n)) code++ } list = list[0 : len(list)-int(bits)] @@ -281,13 +337,8 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // freq An array of frequencies, in which frequency[i] gives the frequency of literal i. // maxBits The maximum number of bits to use for any literal. func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { - if h.freqcache == nil { - // Allocate a reusable buffer with the longest possible frequency table. - // Possible lengths are codegenCodeCount, offsetCodeCount and literalCount. - // The largest of these is literalCount, so we allocate for that case. - h.freqcache = make([]literalNode, literalCount+1) - } list := h.freqcache[:len(freq)+1] + codes := h.codes[:len(freq)] // Number of non-zero literals count := 0 // Set list to be the set of all non-zero literals and their frequencies @@ -296,11 +347,10 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list[count] = literalNode{uint16(i), f} count++ } else { - list[count] = literalNode{} - h.codes[i].len = 0 + codes[i] = 0 } } - list[len(freq)] = literalNode{} + list[count] = literalNode{} list = list[:count] if count <= 2 { @@ -320,44 +370,48 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { h.assignEncodingAndSize(bitCount, list) } +// atLeastOne clamps the result between 1 and 15. func atLeastOne(v float32) float32 { if v < 1 { return 1 } + if v > 15 { + return 15 + } return v } -// histogramSize accumulates a histogram of b in h. -// An estimated size in bits is returned. -// Unassigned values are assigned '1' in the histogram. -// len(h) must be >= 256, and h's elements must be all zeroes. 
-func histogramSize(b []byte, h []uint16, fill bool) (int, int) { - h = h[:256] - for _, t := range b { - h[t]++ - } - invTotal := 1.0 / float32(len(b)) - shannon := float32(0.0) - var extra float32 - if fill { - oneBits := atLeastOne(-mFastLog2(invTotal)) - for i, v := range h[:] { - if v > 0 { - n := float32(v) - shannon += atLeastOne(-mFastLog2(n*invTotal)) * n - } else { - h[i] = 1 - extra += oneBits - } - } +func histogram(b []byte, h []uint16) { + if true && len(b) >= 8<<10 { + // Split for bigger inputs + histogramSplit(b, h) } else { - for _, v := range h[:] { - if v > 0 { - n := float32(v) - shannon += atLeastOne(-mFastLog2(n*invTotal)) * n - } + h = h[:256] + for _, t := range b { + h[t]++ } } +} - return int(shannon + 0.99), int(extra + 0.99) +func histogramSplit(b []byte, h []uint16) { + // Tested, and slightly faster than 2-way. + // Writing to separate arrays and combining is also slightly slower. + h = h[:256] + for len(b)&3 != 0 { + h[b[0]]++ + b = b[1:] + } + n := len(b) / 4 + x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:] + y, z, w = y[:len(x)], z[:len(x)], w[:len(x)] + for i, t := range x { + v0 := &h[t] + v1 := &h[y[i]] + v3 := &h[w[i]] + v2 := &h[z[i]] + *v0++ + *v1++ + *v2++ + *v3++ + } } diff --git a/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go index 2077802990..6c05ba8c1c 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go @@ -42,25 +42,6 @@ func quickSortByFreq(data []literalNode, a, b, maxDepth int) { } } -// siftDownByFreq implements the heap property on data[lo, hi). -// first is an offset into the array where the root of the heap lies. -func siftDownByFreq(data []literalNode, lo, hi, first int) { - root := lo - for { - child := 2*root + 1 - if child >= hi { - break - } - if child+1 < hi && (data[first+child].freq == data[first+child+1].freq && data[first+child].literal < data[first+child+1].literal || data[first+child].freq < data[first+child+1].freq) { - child++ - } - if data[first+root].freq == data[first+child].freq && data[first+root].literal > data[first+child].literal || data[first+root].freq > data[first+child].freq { - return - } - data[first+root], data[first+child] = data[first+child], data[first+root] - root = child - } -} func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) { m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow. 
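// [Editor's aside] histogramSplit above walks four interleaved quarters of
// the input per iteration: a run of identical bytes no longer hammers one
// counter back-to-back, because consecutive increments come from distant
// parts of the buffer. The idea reduced to its core:
func histogramQuartersSketch(b []byte, h *[256]uint16) {
	n := len(b) / 4
	x, y, z, w := b[:n], b[n:2*n], b[2*n:3*n], b[3*n:4*n]
	for i := range x {
		h[x[i]]++ // four mostly independent increments per iteration
		h[y[i]]++
		h[z[i]]++
		h[w[i]]++
	}
	for _, t := range b[4*n:] { // leftover tail, at most 3 bytes
		h[t]++
	}
}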
if hi-lo > 40 { diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go index 16bc51408e..0d7b437f1c 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate.go +++ b/vendor/github.com/klauspost/compress/flate/inflate.go @@ -9,10 +9,10 @@ package flate import ( "bufio" + "compress/flate" "fmt" "io" "math/bits" - "strconv" "sync" ) @@ -36,16 +36,19 @@ type lengthExtra struct { var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}} +var bitMask32 = [32]uint32{ + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, + 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, + 0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF, + 0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF, +} // up to 32 bits + // Initialize the fixedHuffmanDecoder only once upon first use. var fixedOnce sync.Once var fixedHuffmanDecoder huffmanDecoder // A CorruptInputError reports the presence of corrupt input at a given offset. -type CorruptInputError int64 - -func (e CorruptInputError) Error() string { - return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10) -} +type CorruptInputError = flate.CorruptInputError // An InternalError reports an error in the flate code itself. type InternalError string @@ -55,26 +58,12 @@ func (e InternalError) Error() string { return "flate: internal error: " + strin // A ReadError reports an error encountered while reading input. // // Deprecated: No longer returned. -type ReadError struct { - Offset int64 // byte offset where error occurred - Err error // error returned by underlying Read -} - -func (e *ReadError) Error() string { - return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error() -} +type ReadError = flate.ReadError // A WriteError reports an error encountered while writing output. // // Deprecated: No longer returned. -type WriteError struct { - Offset int64 // byte offset where error occurred - Err error // error returned by underlying Write -} - -func (e *WriteError) Error() string { - return "flate: write error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error() -} +type WriteError = flate.WriteError // Resetter resets a ReadCloser returned by NewReader or NewReaderDict to // to switch to a new underlying Reader. 
This permits reusing a ReadCloser @@ -131,8 +120,9 @@ func (h *huffmanDecoder) init(lengths []int) bool { const sanity = false if h.chunks == nil { - h.chunks = &[huffmanNumChunks]uint16{} + h.chunks = new([huffmanNumChunks]uint16) } + if h.maxRead != 0 { *h = huffmanDecoder{chunks: h.chunks, links: h.links} } @@ -186,6 +176,7 @@ func (h *huffmanDecoder) init(lengths []int) bool { } h.maxRead = min + chunks := h.chunks[:] for i := range chunks { chunks[i] = 0 @@ -213,8 +204,7 @@ func (h *huffmanDecoder) init(lengths []int) bool { if cap(h.links[off]) < numLinks { h.links[off] = make([]uint16, numLinks) } else { - links := h.links[off][:0] - h.links[off] = links[:numLinks] + h.links[off] = h.links[off][:numLinks] } } } else { @@ -288,7 +278,7 @@ func (h *huffmanDecoder) init(lengths []int) bool { return true } -// The actual read interface needed by NewReader. +// Reader is the actual read interface needed by NewReader. // If the passed in io.Reader does not also have ReadByte, // the NewReader will introduce its own buffering. type Reader interface { @@ -296,6 +286,26 @@ type Reader interface { io.ByteReader } +type step uint8 + +const ( + copyData step = iota + 1 + nextBlock + huffmanBytesBuffer + huffmanBytesReader + huffmanBufioReader + huffmanStringsReader + huffmanGenericReader +) + +// flushMode tells decompressor when to return data +type flushMode uint8 + +const ( + syncFlush flushMode = iota // return data after sync flush block + partialFlush // return data after each block +) + // Decompress state. type decompressor struct { // Input source. @@ -314,7 +324,7 @@ type decompressor struct { // Next step in the decompression, // and decompression state. - step func(*decompressor) + step step stepState int err error toRead []byte @@ -330,6 +340,8 @@ type decompressor struct { nb uint final bool + + flushMode flushMode } func (f *decompressor) nextBlock() { @@ -346,11 +358,17 @@ func (f *decompressor) nextBlock() { switch typ { case 0: f.dataBlock() + if debugDecode { + fmt.Println("stored block") + } case 1: // compressed, fixed Huffman tables f.hl = &fixedHuffmanDecoder f.hd = nil - f.huffmanBlockDecoder()() + f.huffmanBlockDecoder() + if debugDecode { + fmt.Println("predefinied huffman block") + } case 2: // compressed, dynamic Huffman tables if f.err = f.readHuffman(); f.err != nil { @@ -358,7 +376,10 @@ func (f *decompressor) nextBlock() { } f.hl = &f.h1 f.hd = &f.h2 - f.huffmanBlockDecoder()() + f.huffmanBlockDecoder() + if debugDecode { + fmt.Println("dynamic huffman block") + } default: // 3 is reserved. if debugDecode { @@ -381,14 +402,16 @@ func (f *decompressor) Read(b []byte) (int, error) { if f.err != nil { return 0, f.err } - f.step(f) + + f.doStep() + if f.err != nil && len(f.toRead) == 0 { f.toRead = f.dict.readFlush() // Flush what's left in case of error } } } -// Support the io.WriteTo interface for io.Copy and friends. +// WriteTo implements the io.WriteTo interface for io.Copy and friends. func (f *decompressor) WriteTo(w io.Writer) (int64, error) { total := int64(0) flushed := false @@ -412,7 +435,7 @@ func (f *decompressor) WriteTo(w io.Writer) (int64, error) { return total, f.err } if f.err == nil { - f.step(f) + f.doStep() } if len(f.toRead) == 0 && f.err != nil && !flushed { f.toRead = f.dict.readFlush() // Flush what's left in case of error @@ -568,221 +591,6 @@ func (f *decompressor) readHuffman() error { return nil } -// Decode a single Huffman block from f. 
-// hl and hd are the Huffman states for the lit/length values -// and the distance values, respectively. If hd == nil, using the -// fixed distance encoding associated with fixed Huffman blocks. -func (f *decompressor) huffmanBlockGeneric() { - const ( - stateInit = iota // Zero value must be stateInit - stateDict - ) - - switch f.stepState { - case stateInit: - goto readLiteral - case stateDict: - goto copyHistory - } - -readLiteral: - // Read literal and/or (length, distance) according to RFC section 3.2.3. - { - var v int - { - // Inlined v, err := f.huffSym(f.hl) - // Since a huffmanDecoder can be empty or be composed of a degenerate tree - // with single element, huffSym must error on these two edge cases. In both - // cases, the chunks slice will be 0 for the invalid sequence, leading it - // satisfy the n == 0 check below. - n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b - for { - for nb < n { - c, err := f.r.ReadByte() - if err != nil { - f.b = b - f.nb = nb - f.err = noEOF(err) - return - } - f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 - } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] - n = uint(chunk & huffmanCountMask) - if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] - n = uint(chunk & huffmanCountMask) - } - if n <= nb { - if n == 0 { - f.b = b - f.nb = nb - if debugDecode { - fmt.Println("huffsym: n==0") - } - f.err = CorruptInputError(f.roffset) - return - } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n - v = int(chunk >> huffmanValueShift) - break - } - } - } - - var n uint // number of bits extra - var length int - var err error - switch { - case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlockGeneric - f.stepState = stateInit - return - } - goto readLiteral - case v == 256: - f.finishBlock() - return - // otherwise, reference to older data - case v < 265: - length = v - (257 - 3) - n = 0 - case v < 269: - length = v*2 - (265*2 - 11) - n = 1 - case v < 273: - length = v*4 - (269*4 - 19) - n = 2 - case v < 277: - length = v*8 - (273*8 - 35) - n = 3 - case v < 281: - length = v*16 - (277*16 - 67) - n = 4 - case v < 285: - length = v*32 - (281*32 - 131) - n = 5 - case v < maxNumLit: - length = 258 - n = 0 - default: - if debugDecode { - fmt.Println(v, ">= maxNumLit") - } - f.err = CorruptInputError(f.roffset) - return - } - if n > 0 { - for f.nb < n { - if err = f.moreBits(); err != nil { - if debugDecode { - fmt.Println("morebits n>0:", err) - } - f.err = err - return - } - } - length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n - } - - var dist uint32 - if f.hd == nil { - for f.nb < 5 { - if err = f.moreBits(); err != nil { - if debugDecode { - fmt.Println("morebits f.nb<5:", err) - } - f.err = err - return - } - } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 - } else { - sym, err := f.huffSym(f.hd) - if err != nil { - if debugDecode { - fmt.Println("huffsym:", err) - } - f.err = err - return - } - dist = uint32(sym) - } - - switch { - case dist < 4: - dist++ - case dist < maxNumDist: - nb := uint(dist-2) >> 1 - // have 1 bit in bottom of dist, need nb more.
- extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { - if err = f.moreBits(); err != nil { - if debugDecode { - fmt.Println("morebits f.nb<nb:", err) - } - f.err = err - return - } - } - extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb - dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra - default: - if debugDecode { - fmt.Println("dist too big:", dist, maxNumDist) - } - f.err = CorruptInputError(f.roffset) - return - } - - // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { - if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) - } - f.err = CorruptInputError(f.roffset) - return - } - - f.copyLen, f.copyDist = length, int(dist) - goto copyHistory - } - -copyHistory: - // Perform a backwards copy according to RFC section 3.2.3. - { - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) - if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) - } - f.copyLen -= cnt - - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work - f.stepState = stateDict - return - } - goto readLiteral - } -} - // Copy a single uncompressed data block from input to output. func (f *decompressor) dataBlock() { // Uncompressed. @@ -820,7 +628,10 @@ } if n == 0 { - f.toRead = f.dict.readFlush() + if f.flushMode == syncFlush { + f.toRead = f.dict.readFlush() + } + f.finishBlock() return } @@ -848,7 +659,7 @@ func (f *decompressor) copyData() { if f.dict.availWrite() == 0 || f.copyLen > 0 { f.toRead = f.dict.readFlush() - f.step = (*decompressor).copyData + f.step = copyData return } f.finishBlock() @@ -859,9 +670,34 @@ func (f *decompressor) finishBlock() { if f.dict.availRead() > 0 { f.toRead = f.dict.readFlush() } + f.err = io.EOF + } else if f.flushMode == partialFlush && f.dict.availRead() > 0 { + f.toRead = f.dict.readFlush() + } + + f.step = nextBlock +} + +func (f *decompressor) doStep() { + switch f.step { + case copyData: + f.copyData() + case nextBlock: + f.nextBlock() + case huffmanBytesBuffer: + f.huffmanBytesBuffer() + case huffmanBytesReader: + f.huffmanBytesReader() + case huffmanBufioReader: + f.huffmanBufioReader() + case huffmanStringsReader: + f.huffmanStringsReader() + case huffmanGenericReader: + f.huffmanGenericReader() + default: + panic("BUG: unexpected step state") } - f.step = (*decompressor).nextBlock } // noEOF returns err, unless err == io.EOF, in which case it returns io.ErrUnexpectedEOF.
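The hunks above turn the decompressor's function-pointer `step` into a plain enum dispatched by `doStep`, and thread a `flushMode` through `dataBlock`/`finishBlock` so decoded data can be surfaced after every block (`partialFlush`) instead of only at sync-flush boundaries. Both knobs are exposed through the `NewReaderOpts` constructor added in the next hunk; a minimal usage sketch against that API (the wrapper function is illustrative, error handling elided):

	package flatedemo

	import (
		"bytes"
		"io"

		"github.com/klauspost/compress/flate"
	)

	// readPartial decompresses data written with partial flushes, seeding the
	// window with a preset dictionary, using only the options this diff adds.
	func readPartial(compressed, dict []byte) ([]byte, error) {
		r := flate.NewReaderOpts(bytes.NewReader(compressed),
			flate.WithPartialBlock(), // return data after each block, not only after sync flushes
			flate.WithDict(dict),     // same dictionary behavior as NewReaderDict
		)
		defer r.Close()
		return io.ReadAll(r)
	}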
@@ -964,12 +800,47 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error { h1: f.h1, h2: f.h2, dict: f.dict, - step: (*decompressor).nextBlock, + step: nextBlock, } f.dict.init(maxMatchOffset, dict) return nil } +type ReaderOpt func(*decompressor) + +// WithPartialBlock tells decompressor to return after each block, +// so it can read data written with partial flush +func WithPartialBlock() ReaderOpt { + return func(f *decompressor) { + f.flushMode = partialFlush + } +} + +// WithDict initializes the reader with a preset dictionary +func WithDict(dict []byte) ReaderOpt { + return func(f *decompressor) { + f.dict.init(maxMatchOffset, dict) + } +} + +// NewReaderOpts returns new reader with provided options +func NewReaderOpts(r io.Reader, opts ...ReaderOpt) io.ReadCloser { + fixedHuffmanDecoderInit() + + var f decompressor + f.r = makeReader(r) + f.bits = new([maxNumLit + maxNumDist]int) + f.codebits = new([numCodes]int) + f.step = nextBlock + f.dict.init(maxMatchOffset, nil) + + for _, opt := range opts { + opt(&f) + } + + return &f +} + // NewReader returns a new ReadCloser that can be used // to read the uncompressed version of r. // If r does not also implement io.ByteReader, @@ -979,15 +850,7 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error { // // The ReadCloser returned by NewReader also implements Resetter. func NewReader(r io.Reader) io.ReadCloser { - fixedHuffmanDecoderInit() - - var f decompressor - f.r = makeReader(r) - f.bits = new([maxNumLit + maxNumDist]int) - f.codebits = new([numCodes]int) - f.step = (*decompressor).nextBlock - f.dict.init(maxMatchOffset, nil) - return &f + return NewReaderOpts(r) } // NewReaderDict is like NewReader but initializes the reader @@ -998,13 +861,5 @@ func NewReader(r io.Reader) io.ReadCloser { // // The ReadCloser returned by NewReader also implements Resetter. func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser { - fixedHuffmanDecoderInit() - - var f decompressor - f.r = makeReader(r) - f.bits = new([maxNumLit + maxNumDist]int) - f.codebits = new([numCodes]int) - f.step = (*decompressor).nextBlock - f.dict.init(maxMatchOffset, dict) - return &f + return NewReaderOpts(r, WithDict(dict)) } diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go index cc6db27925..2b2f993f75 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go +++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go @@ -21,6 +21,11 @@ func (f *decompressor) huffmanBytesBuffer() { ) fr := f.r.(*bytes.Buffer) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + switch f.stepState { case stateInit: goto readLiteral @@ -39,41 +44,35 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. 
- nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -83,15 +82,17 @@ readLiteral: var length int switch { case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBytesBuffer + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = huffmanBytesBuffer f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -101,9 +102,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -111,25 +113,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -137,12 +141,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -152,38 +156,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return.
- nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -197,9 +198,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<nb:", err) } f.err = err return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -223,9 +227,10 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) } f.err = CorruptInputError(f.roffset) return @@ -238,20 +243,22 @@ readLiteral: copyHistory: // Perform a backwards copy according to RFC section 3.2.3. { - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + cnt = dict.writeCopy(f.copyDist, f.copyLen) } f.copyLen -= cnt - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() + f.step = huffmanBytesBuffer // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } // Decode a single Huffman block from f. @@ -265,6 +272,11 @@ func (f *decompressor) huffmanBytesReader() { ) fr := f.r.(*bytes.Reader) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + switch f.stepState { case stateInit: goto readLiteral @@ -283,41 +295,35 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) - // Optimization.
Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -327,15 +333,17 @@ readLiteral: var length int switch { case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBytesReader + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = huffmanBytesReader f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -345,9 +353,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -355,25 +364,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -381,12 +392,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -396,38 +407,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return.
- nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -441,9 +449,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<nb:", err) } f.err = err return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -467,9 +478,10 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) } f.err = CorruptInputError(f.roffset) return @@ -482,20 +494,22 @@ readLiteral: copyHistory: // Perform a backwards copy according to RFC section 3.2.3. { - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + cnt = dict.writeCopy(f.copyDist, f.copyLen) } f.copyLen -= cnt - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBytesReader // We need to continue this work + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() + f.step = huffmanBytesReader // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } // Decode a single Huffman block from f. @@ -509,6 +523,11 @@ func (f *decompressor) huffmanBufioReader() { ) fr := f.r.(*bufio.Reader) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + switch f.stepState { case stateInit: goto readLiteral @@ -527,41 +546,35 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) - // Optimization.
Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -571,15 +584,17 @@ readLiteral: var length int switch { case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBufioReader + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = huffmanBufioReader f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -589,9 +604,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -599,25 +615,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -625,12 +643,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -640,38 +658,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return.
- nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -685,9 +700,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<nb:", err) } f.err = err return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -711,9 +729,10 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) } f.err = CorruptInputError(f.roffset) return @@ -726,20 +745,22 @@ readLiteral: copyHistory: // Perform a backwards copy according to RFC section 3.2.3. { - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + cnt = dict.writeCopy(f.copyDist, f.copyLen) } f.copyLen -= cnt - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBufioReader // We need to continue this work + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() + f.step = huffmanBufioReader // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } // Decode a single Huffman block from f. @@ -753,6 +774,11 @@ func (f *decompressor) huffmanStringsReader() { ) fr := f.r.(*strings.Reader) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + switch f.stepState { case stateInit: goto readLiteral @@ -771,41 +797,286 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below.
n := uint(f.hl.maxRead) + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var length int + switch { + case v < 256: + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = huffmanStringsReader + f.stepState = stateInit + f.b, f.nb = fb, fnb + return + } + goto readLiteral + case v == 256: + f.b, f.nb = fb, fnb + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + case v < maxNumLit: + val := decCodeToLen[(v - 257)] + length = int(val.length) + 3 + n := uint(val.extra) + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb + return + } + + var dist uint32 + if f.hd == nil { + for fnb < 5 { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 + } else { + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hd.maxRead) // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + dist = uint32(chunk >> huffmanValueShift) + break + } + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. 
+ extra := (dist & 1) << (nb & regSizeMaskUint32) + for fnb < nb { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<nb:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb + dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra + default: + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, int(dist) + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. + { + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() + f.step = huffmanStringsReader // We need to continue this work + f.stepState = stateDict + f.b, f.nb = fb, fnb + return + } + goto readLiteral + } + // Not reached +} + +// Decode a single Huffman block from f. +// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. +func (f *decompressor) huffmanGenericReader() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.(Reader) + + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. + { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below.
+ n := uint(f.hl.maxRead) + for { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -815,15 +1086,17 @@ readLiteral: var length int switch { case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanStringsReader + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = huffmanGenericReader f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -833,9 +1106,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -843,25 +1117,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -869,12 +1145,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -884,38 +1160,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return.
- nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -929,9 +1202,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<nb:", err) } f.err = err return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) - f.b >>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -955,9 +1231,10 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) } f.err = CorruptInputError(f.roffset) return @@ -970,33 +1247,37 @@ readLiteral: copyHistory: // Perform a backwards copy according to RFC section 3.2.3.
{ - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + cnt = dict.writeCopy(f.copyDist, f.copyLen) } f.copyLen -= cnt - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanStringsReader // We need to continue this work + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() + f.step = huffmanGenericReader // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } -func (f *decompressor) huffmanBlockDecoder() func() { +func (f *decompressor) huffmanBlockDecoder() { switch f.r.(type) { case *bytes.Buffer: - return f.huffmanBytesBuffer + f.huffmanBytesBuffer() case *bytes.Reader: - return f.huffmanBytesReader + f.huffmanBytesReader() case *bufio.Reader: - return f.huffmanBufioReader + f.huffmanBufioReader() case *strings.Reader: - return f.huffmanStringsReader + f.huffmanStringsReader() + case Reader: + f.huffmanGenericReader() default: - return f.huffmanBlockGeneric + f.huffmanGenericReader() } } diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go index 1e5eea3968..703b9a89aa 100644 --- a/vendor/github.com/klauspost/compress/flate/level1.go +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -1,6 +1,10 @@ package flate -import "fmt" +import ( + "encoding/binary" + "fmt" + "math/bits" +) // fastGen maintains the table for matches, // and the previous byte block for level 2. @@ -15,6 +19,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -64,7 +69,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { sLimit := int32(len(src) - inputMargin) // nextEmit is where in src the next emitLiteral should start from. - cv := load3232(src, s) + cv := load6432(src, s) for { const skipLog = 5 @@ -73,7 +78,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { nextS := s var candidate tableEntry for { - nextHash := hash(cv) + nextHash := hashLen(cv, tableBits, hashBytes) candidate = e.table[nextHash] nextS = s + doEvery + (s-nextEmit)>>skipLog if nextS > sLimit { @@ -82,16 +87,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { now := load6432(src, nextS) e.table[nextHash] = tableEntry{offset: s + e.cur} - nextHash = hash(uint32(now)) + nextHash = hashLen(now, tableBits, hashBytes) offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } // Do one right away... 
- cv = uint32(now) + cv = now s = nextS nextS++ candidate = e.table[nextHash] @@ -99,11 +104,11 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { e.table[nextHash] = tableEntry{offset: s + e.cur} offset = s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } - cv = uint32(now) + cv = now s = nextS } @@ -116,7 +121,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // Extend the 4-byte match as long as possible. t := candidate.offset - e.cur - l := e.matchlenLong(s+4, t+4, src) + 4 + var l = int32(4) + if false { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else { + // inlined: + a := src[s+4:] + b := src[t+4:] + for len(a) >= 8 { + if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 { + l += int32(bits.TrailingZeros64(diff) >> 3) + break + } + l += 8 + a = a[8:] + b = b[8:] + } + if len(a) < 8 { + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + break + } + l++ + } + } + } // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { @@ -125,11 +155,43 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } // Save the match found - dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + if false { + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + } else { + // Inlined... + xoffset := uint32(s - t - baseMatchOffset) + xlength := l + oc := offsetCode(xoffset) + xoffset |= oc << 16 + for xlength > 0 { + xl := xlength + if xl > 258 { + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xlength -= xl + xl -= baseMatchLength + dst.extraHist[lengthCodes1[uint8(xl)]]++ + dst.offHist[oc]++ + dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset) + dst.n++ + } + } s += l nextEmit = s if nextS >= s { @@ -137,9 +199,9 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { } if s >= sLimit { // Index first pair after match end. - if int(s+l+4) < len(src) { - cv := load3232(src, s) - e.table[hash(cv)] = tableEntry{offset: s + e.cur} + if int(s+l+8) < len(src) { + cv := load6432(src, s) + e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -152,16 +214,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // three load32 calls.
x := load6432(src, s-2) o := e.cur + s - 2 - prevHash := hash(uint32(x)) + prevHash := hashLen(x, tableBits, hashBytes) e.table[prevHash] = tableEntry{offset: o} x >>= 16 - currHash := hash(uint32(x)) + currHash := hashLen(x, tableBits, hashBytes) candidate = e.table[currHash] e.table[currHash] = tableEntry{offset: o + 2} offset := s - (candidate.offset - e.cur) if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) { - cv = uint32(x >> 8) + cv = x >> 8 s++ break } diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go index 5b986a1944..876dfbe305 100644 --- a/vendor/github.com/klauspost/compress/flate/level2.go +++ b/vendor/github.com/klauspost/compress/flate/level2.go @@ -16,6 +16,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 ) if debugDeflate && e.cur < 0 { @@ -66,7 +67,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { sLimit := int32(len(src) - inputMargin) // nextEmit is where in src the next emitLiteral should start from. - cv := load3232(src, s) + cv := load6432(src, s) for { // When should we start skipping if we haven't found matches in a long while. const skipLog = 5 @@ -75,7 +76,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { nextS := s var candidate tableEntry for { - nextHash := hash4u(cv, bTableBits) + nextHash := hashLen(cv, bTableBits, hashBytes) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog if nextS > sLimit { @@ -84,16 +85,16 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { candidate = e.table[nextHash] now := load6432(src, nextS) e.table[nextHash] = tableEntry{offset: s + e.cur} - nextHash = hash4u(uint32(now), bTableBits) + nextHash = hashLen(now, bTableBits, hashBytes) offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } // Do one right away... - cv = uint32(now) + cv = now s = nextS nextS++ candidate = e.table[nextHash] @@ -101,10 +102,10 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { e.table[nextHash] = tableEntry{offset: s + e.cur} offset = s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { break } - cv = uint32(now) + cv = now } // A 4-byte match has been found. We'll later see if more than 4 bytes @@ -134,7 +135,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) @@ -146,25 +155,25 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { if s >= sLimit { // Index first pair after match end. - if int(s+l+4) < len(src) { - cv := load3232(src, s) - e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur} + if int(s+l+8) < len(src) { + cv := load6432(src, s) + e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur} } goto emitRemainder } // Store every second hash in-between, but offset by 1. 
for i := s - l + 2; i < s-5; i += 7 { - x := load6432(src, int32(i)) - nextHash := hash4u(uint32(x), bTableBits) + x := load6432(src, i) + nextHash := hashLen(x, bTableBits, hashBytes) e.table[nextHash] = tableEntry{offset: e.cur + i} // Skip one x >>= 16 - nextHash = hash4u(uint32(x), bTableBits) + nextHash = hashLen(x, bTableBits, hashBytes) e.table[nextHash] = tableEntry{offset: e.cur + i + 2} // Skip one x >>= 16 - nextHash = hash4u(uint32(x), bTableBits) + nextHash = hashLen(x, bTableBits, hashBytes) e.table[nextHash] = tableEntry{offset: e.cur + i + 4} } @@ -176,17 +185,17 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { // three load32 calls. x := load6432(src, s-2) o := e.cur + s - 2 - prevHash := hash4u(uint32(x), bTableBits) - prevHash2 := hash4u(uint32(x>>8), bTableBits) + prevHash := hashLen(x, bTableBits, hashBytes) + prevHash2 := hashLen(x>>8, bTableBits, hashBytes) e.table[prevHash] = tableEntry{offset: o} e.table[prevHash2] = tableEntry{offset: o + 1} - currHash := hash4u(uint32(x>>16), bTableBits) + currHash := hashLen(x>>16, bTableBits, hashBytes) candidate = e.table[currHash] e.table[currHash] = tableEntry{offset: o + 2} offset := s - (candidate.offset - e.cur) if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) { - cv = uint32(x >> 24) + cv = x >> 24 s++ break } diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go index c22b4244a5..7aa2b72a12 100644 --- a/vendor/github.com/klauspost/compress/flate/level3.go +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -5,14 +5,17 @@ import "fmt" // fastEncL3 type fastEncL3 struct { fastGen - table [tableSize]tableEntryPrev + table [1 << 16]tableEntryPrev } // Encode uses a similar algorithm to level 2, will check up to two candidates. func (e *fastEncL3) Encode(dst *tokens, src []byte) { const ( - inputMargin = 8 - 1 + inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + tableBits = 16 + tableSize = 1 << tableBits + hashBytes = 5 ) if debugDeflate && e.cur < 0 { @@ -67,20 +70,20 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { sLimit := int32(len(src) - inputMargin) // nextEmit is where in src the next emitLiteral should start from. - cv := load3232(src, s) + cv := load6432(src, s) for { - const skipLog = 6 + const skipLog = 7 nextS := s var candidate tableEntry for { - nextHash := hash(cv) + nextHash := hashLen(cv, tableBits, hashBytes) s = nextS nextS = s + 1 + (s-nextEmit)>>skipLog if nextS > sLimit { goto emitRemainder } candidates := e.table[nextHash] - now := load3232(src, nextS) + now := load6432(src, nextS) // Safe offset distance until s + 4... minOffset := e.cur + s - (maxMatchOffset - 4) @@ -94,8 +97,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { continue } - if cv == load3232(src, candidate.offset-e.cur) { - if candidates.Prev.offset < minOffset || cv != load3232(src, candidates.Prev.offset-e.cur) { + if uint32(cv) == load3232(src, candidate.offset-e.cur) { + if candidates.Prev.offset < minOffset || uint32(cv) != load3232(src, candidates.Prev.offset-e.cur) { break } // Both match and are valid, pick longest. @@ -110,7 +113,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { // We only check if value mismatches. // Offset will always be invalid in other cases. 
candidate = candidates.Prev - if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { + if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { break } } @@ -141,7 +144,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) @@ -154,9 +165,9 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { if s >= sLimit { t += l // Index first pair after match end. - if int(t+4) < len(src) && t > 0 { - cv := load3232(src, t) - nextHash := hash(cv) + if int(t+8) < len(src) && t > 0 { + cv = load6432(src, t) + nextHash := hashLen(cv, tableBits, hashBytes) e.table[nextHash] = tableEntryPrev{ Prev: e.table[nextHash].Cur, Cur: tableEntry{offset: e.cur + t}, @@ -165,32 +176,33 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { goto emitRemainder } - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-3 to s. - x := load6432(src, s-3) - prevHash := hash(uint32(x)) - e.table[prevHash] = tableEntryPrev{ - Prev: e.table[prevHash].Cur, - Cur: tableEntry{offset: e.cur + s - 3}, + // Store every 5th hash in-between. + for i := s - l + 2; i < s-5; i += 6 { + nextHash := hashLen(load6432(src, i), tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + i}} } - x >>= 8 - prevHash = hash(uint32(x)) + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. + x := load6432(src, s-2) + prevHash := hashLen(x, tableBits, hashBytes) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, Cur: tableEntry{offset: e.cur + s - 2}, } x >>= 8 - prevHash = hash(uint32(x)) + prevHash = hashLen(x, tableBits, hashBytes) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, Cur: tableEntry{offset: e.cur + s - 1}, } x >>= 8 - currHash := hash(uint32(x)) + currHash := hashLen(x, tableBits, hashBytes) candidates := e.table[currHash] - cv = uint32(x) + cv = x e.table[currHash] = tableEntryPrev{ Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}, @@ -200,18 +212,18 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { candidate = candidates.Cur minOffset := e.cur + s - (maxMatchOffset - 4) - if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) { - // We only check if value mismatches. - // Offset will always be invalid in other cases. + if candidate.offset > minOffset { + if uint32(cv) == load3232(src, candidate.offset-e.cur) { + // Found a match... + continue + } candidate = candidates.Prev - if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } + if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { + // Match at prev... 
+ continue } } - cv = uint32(x >> 8) + cv = x >> 8 s++ break } diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go index e62f0c02b1..23c08b325c 100644 --- a/vendor/github.com/klauspost/compress/flate/level4.go +++ b/vendor/github.com/klauspost/compress/flate/level4.go @@ -12,6 +12,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -80,7 +81,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { nextS := s var t int32 for { - nextHashS := hash4x64(cv, tableBits) + nextHashS := hashLen(cv, tableBits, hashShortBytes) nextHashL := hash7(cv, tableBits) s = nextS @@ -135,7 +136,15 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } if debugDeflate { if t >= s { @@ -160,7 +169,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { // Index first pair after match end. if int(s+8) < len(src) { cv := load6432(src, s) - e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur} e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur} } goto emitRemainder @@ -175,7 +184,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} e.bTable[hash7(cv, tableBits)] = t e.bTable[hash7(cv>>8, tableBits)] = t2 - e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 i += 3 for ; i < s-1; i += 3 { @@ -184,7 +193,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} e.bTable[hash7(cv, tableBits)] = t e.bTable[hash7(cv>>8, tableBits)] = t2 - e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } } } @@ -193,7 +202,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { // compression we first update the hash table at s-1 and at s. 
x := load6432(src, s-1) o := e.cur + s - 1 - prevHashS := hash4x64(x, tableBits) + prevHashS := hashLen(x, tableBits, hashShortBytes) prevHashL := hash7(x, tableBits) e.table[prevHashS] = tableEntry{offset: o} e.bTable[prevHashL] = tableEntry{offset: o} diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go index d513f1ffd3..1f61ec1829 100644 --- a/vendor/github.com/klauspost/compress/flate/level5.go +++ b/vendor/github.com/klauspost/compress/flate/level5.go @@ -12,6 +12,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -88,7 +89,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { var l int32 var t int32 for { - nextHashS := hash4x64(cv, tableBits) + nextHashS := hashLen(cv, tableBits, hashShortBytes) nextHashL := hash7(cv, tableBits) s = nextS @@ -105,7 +106,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { eLong := &e.bTable[nextHashL] eLong.Cur, eLong.Prev = entry, eLong.Cur - nextHashS = hash4x64(next, tableBits) + nextHashS = hashLen(next, tableBits, hashShortBytes) nextHashL = hash7(next, tableBits) t = lCandidate.Cur.offset - e.cur @@ -182,12 +183,346 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit // them as literal bytes. - // Extend the 4-byte match as long as possible. if l == 0 { + // Extend the 4-byte match as long as possible. l = e.matchlenLong(s+4, t+4, src) + 4 } else if l == maxMatchLength { l += e.matchlenLong(s+l, t+l, src) } + + // Try to locate a better match by checking the end of best match... + if sAt := s + l; l < 30 && sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 + eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset + t2 := eLong - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if t2 >= 0 && off < maxMatchOffset && off > 0 { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + + // Extend backwards + for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + if nextEmit < s { + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } + } + if debugDeflate { + if t >= s { + panic(fmt.Sprintln("s-t", s, t)) + } + if (s - t) > maxMatchOffset { + panic(fmt.Sprintln("mmo", s-t)) + } + if l < baseMatchLength { + panic("bml") + } + } + + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + s += l + nextEmit = s + if nextS >= s { + s = nextS + 1 + } + + if s >= sLimit { + goto emitRemainder + } + + // Store every 3rd hash in-between. 
+ if true { + const hashEvery = 3 + i := s - l + 1 + if i < s-1 { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // Do an long at i+1 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + eLong = &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + + // We only have enough bits for a short entry at i+2 + cv >>= 8 + t = tableEntry{offset: t.offset + 1} + e.table[hashLen(cv, tableBits, hashShortBytes)] = t + + // Skip one - otherwise we risk hitting 's' + i += 4 + for ; i < s-1; i += hashEvery { + cv := load6432(src, i) + t := tableEntry{offset: i + e.cur} + t2 := tableEntry{offset: t.offset + 1} + eLong := &e.bTable[hash7(cv, tableBits)] + eLong.Cur, eLong.Prev = t, eLong.Cur + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 + } + } + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. + x := load6432(src, s-1) + o := e.cur + s - 1 + prevHashS := hashLen(x, tableBits, hashShortBytes) + prevHashL := hash7(x, tableBits) + e.table[prevHashS] = tableEntry{offset: o} + eLong := &e.bTable[prevHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur + cv = x >> 8 + } + +emitRemainder: + if int(nextEmit) < len(src) { + // If nothing was added, don't encode literals. + if dst.n == 0 { + return + } + + emitLiteral(dst, src[nextEmit:]) + } +} + +// fastEncL5Window is a level 5 encoder, +// but with a custom window size. +type fastEncL5Window struct { + hist []byte + cur int32 + maxOffset int32 + table [tableSize]tableEntry + bTable [tableSize]tableEntryPrev +} + +func (e *fastEncL5Window) Encode(dst *tokens, src []byte) { + const ( + inputMargin = 12 - 1 + minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 + ) + maxMatchOffset := e.maxOffset + if debugDeflate && e.cur < 0 { + panic(fmt.Sprint("e.cur < 0: ", e.cur)) + } + + // Protect against e.cur wraparound. + for e.cur >= bufferReset { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.bTable[:] { + e.bTable[i] = tableEntryPrev{} + } + e.cur = maxMatchOffset + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - maxMatchOffset + for i := range e.table[:] { + v := e.table[i].offset + if v <= minOff { + v = 0 + } else { + v = v - e.cur + maxMatchOffset + } + e.table[i].offset = v + } + for i := range e.bTable[:] { + v := e.bTable[i] + if v.Cur.offset <= minOff { + v.Cur.offset = 0 + v.Prev.offset = 0 + } else { + v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset + if v.Prev.offset <= minOff { + v.Prev.offset = 0 + } else { + v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset + } + } + e.bTable[i] = v + } + e.cur = maxMatchOffset + } + + s := e.addBlock(src) + + // This check isn't in the Snappy implementation, but there, the caller + // instead of the callee handles this case. + if len(src) < minNonLiteralBlockSize { + // We do not fill the token table. + // This will be picked up by caller. + dst.n = uint16(len(src)) + return + } + + // Override src + src = e.hist + nextEmit := s + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. 
+ sLimit := int32(len(src) - inputMargin) + + // nextEmit is where in src the next emitLiteral should start from. + cv := load6432(src, s) + for { + const skipLog = 6 + const doEvery = 1 + + nextS := s + var l int32 + var t int32 + for { + nextHashS := hashLen(cv, tableBits, hashShortBytes) + nextHashL := hash7(cv, tableBits) + + s = nextS + nextS = s + doEvery + (s-nextEmit)>>skipLog + if nextS > sLimit { + goto emitRemainder + } + // Fetch a short+long candidate + sCandidate := e.table[nextHashS] + lCandidate := e.bTable[nextHashL] + next := load6432(src, nextS) + entry := tableEntry{offset: s + e.cur} + e.table[nextHashS] = entry + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = entry, eLong.Cur + + nextHashS = hashLen(next, tableBits, hashShortBytes) + nextHashL = hash7(next, tableBits) + + t = lCandidate.Cur.offset - e.cur + if s-t < maxMatchOffset { + if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + t2 := lCandidate.Prev.offset - e.cur + if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + l = e.matchlen(s+4, t+4, src) + 4 + ml1 := e.matchlen(s+4, t2+4, src) + 4 + if ml1 > l { + t = t2 + l = ml1 + break + } + } + break + } + t = lCandidate.Prev.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + // Store the next match + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + break + } + } + + t = sCandidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { + // Found a 4 match... + l = e.matchlen(s+4, t+4, src) + 4 + lCandidate = e.bTable[nextHashL] + // Store the next match + + e.table[nextHashS] = tableEntry{offset: nextS + e.cur} + eLong := &e.bTable[nextHashL] + eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur + + // If the next long is a candidate, use that... + t2 := lCandidate.Cur.offset - e.cur + if nextS-t2 < maxMatchOffset { + if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + // If the previous long is a candidate, use that... + t2 = lCandidate.Prev.offset - e.cur + if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) { + ml := e.matchlen(nextS+4, t2+4, src) + 4 + if ml > l { + t = t2 + s = nextS + l = ml + break + } + } + } + break + } + cv = next + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + if l == 0 { + // Extend the 4-byte match as long as possible. + l = e.matchlenLong(s+4, t+4, src) + 4 + } else if l == maxMatchLength { + l += e.matchlenLong(s+l, t+l, src) + } + + // Try to locate a better match by checking the end of best match... + if sAt := s + l; l < 30 && sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
+ const skipBeginning = 2 + eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset + t2 := eLong - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if t2 >= 0 && off < maxMatchOffset && off > 0 { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { s-- @@ -195,7 +530,15 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } if debugDeflate { if t >= s { @@ -227,7 +570,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { if i < s-1 { cv := load6432(src, i) t := tableEntry{offset: i + e.cur} - e.table[hash4x64(cv, tableBits)] = t + e.table[hashLen(cv, tableBits, hashShortBytes)] = t eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = t, eLong.Cur @@ -240,7 +583,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // We only have enough bits for a short entry at i+2 cv >>= 8 t = tableEntry{offset: t.offset + 1} - e.table[hash4x64(cv, tableBits)] = t + e.table[hashLen(cv, tableBits, hashShortBytes)] = t // Skip one - otherwise we risk hitting 's' i += 4 @@ -250,7 +593,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = t, eLong.Cur - e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } } } @@ -259,7 +602,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // compression we first update the hash table at s-1 and at s. x := load6432(src, s-1) o := e.cur + s - 1 - prevHashS := hash4x64(x, tableBits) + prevHashS := hashLen(x, tableBits, hashShortBytes) prevHashL := hash7(x, tableBits) e.table[prevHashS] = tableEntry{offset: o} eLong := &e.bTable[prevHashL] @@ -277,3 +620,89 @@ emitRemainder: emitLiteral(dst, src[nextEmit:]) } } + +// Reset the encoding table. +func (e *fastEncL5Window) Reset() { + // We keep the same allocs, since we are compressing the same block sizes. + if cap(e.hist) < allocHistory { + e.hist = make([]byte, 0, allocHistory) + } + + // We offset current position so everything will be out of reach. + // If we are above the buffer reset it will be cleared anyway since len(hist) == 0. + if e.cur <= int32(bufferReset) { + e.cur += e.maxOffset + int32(len(e.hist)) + } + e.hist = e.hist[:0] +} + +func (e *fastEncL5Window) addBlock(src []byte) int32 { + // check if we have space already + maxMatchOffset := e.maxOffset + + if len(e.hist)+len(src) > cap(e.hist) { + if cap(e.hist) == 0 { + e.hist = make([]byte, 0, allocHistory) + } else { + if cap(e.hist) < int(maxMatchOffset*2) { + panic("unexpected buffer size") + } + // Move down + offset := int32(len(e.hist)) - maxMatchOffset + copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + e.cur += offset + e.hist = e.hist[:maxMatchOffset] + } + } + s := int32(len(e.hist)) + e.hist = append(e.hist, src...) + return s +} + +// matchlen will return the match length between offsets and t in src. +// The maximum length returned is maxMatchLength - 4. +// It is assumed that s > t, that t >=0 and s < len(src). 
+func (e *fastEncL5Window) matchlen(s, t int32, src []byte) int32 { + if debugDecode { + if t >= s { + panic(fmt.Sprint("t >=s:", t, s)) + } + if int(s) >= len(src) { + panic(fmt.Sprint("s >= len(src):", s, len(src))) + } + if t < 0 { + panic(fmt.Sprint("t < 0:", t)) + } + if s-t > e.maxOffset { + panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) + } + } + s1 := int(s) + maxMatchLength - 4 + if s1 > len(src) { + s1 = len(src) + } + + // Extend the match to be as long as possible. + return int32(matchLen(src[s:s1], src[t:])) +} + +// matchlenLong will return the match length between offsets and t in src. +// It is assumed that s > t, that t >=0 and s < len(src). +func (e *fastEncL5Window) matchlenLong(s, t int32, src []byte) int32 { + if debugDeflate { + if t >= s { + panic(fmt.Sprint("t >=s:", t, s)) + } + if int(s) >= len(src) { + panic(fmt.Sprint("s >= len(src):", s, len(src))) + } + if t < 0 { + panic(fmt.Sprint("t < 0:", t)) + } + if s-t > e.maxOffset { + panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) + } + } + // Extend the match to be as long as possible. + return int32(matchLen(src[s:], src[t:])) +} diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go index a52c80ea45..f1e9d98fa5 100644 --- a/vendor/github.com/klauspost/compress/flate/level6.go +++ b/vendor/github.com/klauspost/compress/flate/level6.go @@ -12,6 +12,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -90,7 +91,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { var l int32 var t int32 for { - nextHashS := hash4x64(cv, tableBits) + nextHashS := hashLen(cv, tableBits, hashShortBytes) nextHashL := hash7(cv, tableBits) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog @@ -107,7 +108,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { eLong.Cur, eLong.Prev = entry, eLong.Cur // Calculate hashes of 'next' - nextHashS = hash4x64(next, tableBits) + nextHashS = hashLen(next, tableBits, hashShortBytes) nextHashL = hash7(next, tableBits) t = lCandidate.Cur.offset - e.cur @@ -211,6 +212,40 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { l += e.matchlenLong(s+l, t+l, src) } + // Try to locate a better match by checking the end-of-match... + if sAt := s + l; sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
+ const skipBeginning = 2 + eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)] + // Test current + t2 := eLong.Cur.offset - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 + if off < maxMatchOffset { + if off > 0 && t2 >= 0 { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + // Test next: + t2 = eLong.Prev.offset - e.cur - l + skipBeginning + off := s2 - t2 + if off > 0 && off < maxMatchOffset && t2 >= 0 { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { + t = t2 + l = l2 + s = s2 + } + } + } + } + // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { s-- @@ -218,7 +253,15 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } if false { if t >= s { @@ -244,7 +287,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Index after match end. for i := nextS + 1; i < int32(len(src))-8; i += 2 { cv := load6432(src, i) - e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur} eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur } @@ -259,7 +302,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} eLong := &e.bTable[hash7(cv, tableBits)] eLong2 := &e.bTable[hash7(cv>>8, tableBits)] - e.table[hash4x64(cv, tableBits)] = t + e.table[hashLen(cv, tableBits, hashShortBytes)] = t eLong.Cur, eLong.Prev = t, eLong.Cur eLong2.Cur, eLong2.Prev = t2, eLong2.Cur } diff --git a/vendor/github.com/klauspost/compress/flate/matchlen_amd64.go b/vendor/github.com/klauspost/compress/flate/matchlen_amd64.go new file mode 100644 index 0000000000..4bd3885841 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/matchlen_amd64.go @@ -0,0 +1,16 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +// Copyright 2019+ Klaus Post. All rights reserved. +// License information can be found in the LICENSE file. + +package flate + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// +// len(a) <= len(b) and len(a) > 0 +// +//go:noescape +func matchLen(a []byte, b []byte) int diff --git a/vendor/github.com/klauspost/compress/flate/matchlen_amd64.s b/vendor/github.com/klauspost/compress/flate/matchlen_amd64.s new file mode 100644 index 0000000000..0782b86e3d --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/matchlen_amd64.s @@ -0,0 +1,66 @@ +// Copied from S2 implementation. 
+ +//go:build !appengine && !noasm && gc && !noasm + +#include "textflag.h" + +// func matchLen(a []byte, b []byte) int +TEXT ·matchLen(SB), NOSPLIT, $0-56 + MOVQ a_base+0(FP), AX + MOVQ b_base+24(FP), CX + MOVQ a_len+8(FP), DX + + // matchLen + XORL SI, SI + CMPL DX, $0x08 + JB matchlen_match4_standalone + +matchlen_loopback_standalone: + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX + JZ matchlen_loop_standalone + +#ifdef GOAMD64_v3 + TZCNTQ BX, BX +#else + BSFQ BX, BX +#endif + SHRL $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end + +matchlen_loop_standalone: + LEAL -8(DX), DX + LEAL 8(SI), SI + CMPL DX, $0x08 + JAE matchlen_loopback_standalone + +matchlen_match4_standalone: + CMPL DX, $0x04 + JB matchlen_match2_standalone + MOVL (AX)(SI*1), BX + CMPL (CX)(SI*1), BX + JNE matchlen_match2_standalone + LEAL -4(DX), DX + LEAL 4(SI), SI + +matchlen_match2_standalone: + CMPL DX, $0x02 + JB matchlen_match1_standalone + MOVW (AX)(SI*1), BX + CMPW (CX)(SI*1), BX + JNE matchlen_match1_standalone + LEAL -2(DX), DX + LEAL 2(SI), SI + +matchlen_match1_standalone: + CMPL DX, $0x01 + JB gen_match_len_end + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL + JNE gen_match_len_end + INCL SI + +gen_match_len_end: + MOVQ SI, ret+48(FP) + RET diff --git a/vendor/github.com/klauspost/compress/flate/matchlen_generic.go b/vendor/github.com/klauspost/compress/flate/matchlen_generic.go new file mode 100644 index 0000000000..ad5cd814b9 --- /dev/null +++ b/vendor/github.com/klauspost/compress/flate/matchlen_generic.go @@ -0,0 +1,33 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +// Copyright 2019+ Klaus Post. All rights reserved. +// License information can be found in the LICENSE file. + +package flate + +import ( + "encoding/binary" + "math/bits" +) + +// matchLen returns the maximum common prefix length of a and b. +// a must be the shortest of the two. +func matchLen(a, b []byte) (n int) { + for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] { + diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b) + if diff != 0 { + return n + bits.TrailingZeros64(diff)>>3 + } + n += 8 + } + + for i := range a { + if a[i] != b[i] { + break + } + n++ + } + return n + +} diff --git a/vendor/github.com/klauspost/compress/flate/regmask_other.go b/vendor/github.com/klauspost/compress/flate/regmask_other.go index f477a5d6e5..1b7a2cbd79 100644 --- a/vendor/github.com/klauspost/compress/flate/regmask_other.go +++ b/vendor/github.com/klauspost/compress/flate/regmask_other.go @@ -1,4 +1,5 @@ -//+build !amd64 +//go:build !amd64 +// +build !amd64 package flate diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go index 53e8991246..f3d4139ef3 100644 --- a/vendor/github.com/klauspost/compress/flate/stateless.go +++ b/vendor/github.com/klauspost/compress/flate/stateless.go @@ -59,9 +59,9 @@ var bitWriterPool = sync.Pool{ }, } -// StatelessDeflate allows to compress directly to a Writer without retaining state. +// StatelessDeflate allows compressing directly to a Writer without retaining state. // When returning everything will be flushed. -// Up to 8KB of an optional dictionary can be given which is presumed to presumed to precede the block. +// Up to 8KB of an optional dictionary can be given which is presumed to precede the block. // Longer dictionaries will be truncated and will still produce valid output. 
// Sending nil dictionary is perfectly fine. func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { @@ -86,11 +86,19 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { dict = dict[len(dict)-maxStatelessDict:] } + // For subsequent loops, keep shallow dict reference to avoid alloc+copy. + var inDict []byte + for len(in) > 0 { todo := in - if len(todo) > maxStatelessBlock-len(dict) { + if len(inDict) > 0 { + if len(todo) > maxStatelessBlock-maxStatelessDict { + todo = todo[:maxStatelessBlock-maxStatelessDict] + } + } else if len(todo) > maxStatelessBlock-len(dict) { todo = todo[:maxStatelessBlock-len(dict)] } + inOrg := in in = in[len(todo):] uncompressed := todo if len(dict) > 0 { @@ -102,7 +110,11 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { todo = combined } // Compress - statelessEnc(&dst, todo, int16(len(dict))) + if len(inDict) == 0 { + statelessEnc(&dst, todo, int16(len(dict))) + } else { + statelessEnc(&dst, inDict[:maxStatelessDict+len(todo)], maxStatelessDict) + } isEof := eof && len(in) == 0 if dst.n == 0 { @@ -119,7 +131,8 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { } if len(in) > 0 { // Retain a dict if we have more - dict = todo[len(todo)-maxStatelessDict:] + inDict = inOrg[len(uncompressed)-maxStatelessDict:] + dict = nil dst.Reset() } if bw.err != nil { @@ -249,7 +262,15 @@ func statelessEnc(dst *tokens, src []byte, startAt int16) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } // Save the match found diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go index f9abf606d6..d818790c13 100644 --- a/vendor/github.com/klauspost/compress/flate/token.go +++ b/vendor/github.com/klauspost/compress/flate/token.go @@ -13,14 +13,16 @@ import ( ) const ( - // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused - // 8 bits: xlength = length - MIN_MATCH_LENGTH - // 22 bits xoffset = offset - MIN_OFFSET_SIZE, or literal - lengthShift = 22 - offsetMask = 1<maxnumlit offHist [32]uint16 // offset codes litHist [256]uint16 // codes 0->255 - n uint16 // Must be able to contain maxStoreBlockSize + nFilled int + n uint16 // Must be able to contain maxStoreBlockSize tokens [maxStoreBlockSize + 1]token } @@ -139,7 +141,7 @@ func (t *tokens) Reset() { return } t.n = 0 - t.nLits = 0 + t.nFilled = 0 for i := range t.litHist[:] { t.litHist[i] = 0 } @@ -158,12 +160,12 @@ func (t *tokens) Fill() { for i, v := range t.litHist[:] { if v == 0 { t.litHist[i] = 1 - t.nLits++ + t.nFilled++ } } for i, v := range t.extraHist[:literalCount-256] { if v == 0 { - t.nLits++ + t.nFilled++ t.extraHist[i] = 1 } } @@ -187,26 +189,23 @@ func (t *tokens) indexTokens(in []token) { t.AddLiteral(tok.literal()) continue } - t.AddMatch(uint32(tok.length()), tok.offset()) + t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask) } } // emitLiteral writes a literal chunk and returns the number of bytes written. 
func emitLiteral(dst *tokens, lit []byte) { - ol := int(dst.n) - for i, v := range lit { - dst.tokens[(i+ol)&maxStoreBlockSize] = token(v) + for _, v := range lit { + dst.tokens[dst.n] = token(v) dst.litHist[v]++ + dst.n++ } - dst.n += uint16(len(lit)) - dst.nLits += len(lit) } func (t *tokens) AddLiteral(lit byte) { t.tokens[t.n] = token(lit) t.litHist[lit]++ t.n++ - t.nLits++ } // from https://stackoverflow.com/a/28730362 @@ -227,12 +226,13 @@ func (t *tokens) EstimatedBits() int { shannon := float32(0) bits := int(0) nMatches := 0 - if t.nLits > 0 { - invTotal := 1.0 / float32(t.nLits) + total := int(t.n) + t.nFilled + if total > 0 { + invTotal := 1.0 / float32(total) for _, v := range t.litHist[:] { if v > 0 { n := float32(v) - shannon += -mFastLog2(n*invTotal) * n + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n } } // Just add 15 for EOB @@ -240,7 +240,7 @@ func (t *tokens) EstimatedBits() int { for i, v := range t.extraHist[1 : literalCount-256] { if v > 0 { n := float32(v) - shannon += -mFastLog2(n*invTotal) * n + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n bits += int(lengthExtraBits[i&31]) * int(v) nMatches += int(v) } @@ -251,7 +251,7 @@ func (t *tokens) EstimatedBits() int { for i, v := range t.offHist[:offsetCodeCount] { if v > 0 { n := float32(v) - shannon += -mFastLog2(n*invTotal) * n + shannon += atLeastOne(-mFastLog2(n*invTotal)) * n bits += int(offsetExtraBits[i&31]) * int(v) } } @@ -270,11 +270,12 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { panic(fmt.Errorf("invalid offset: %v", xoffset)) } } - t.nLits++ - lengthCode := lengthCodes1[uint8(xlength)] & 31 + oCode := offsetCode(xoffset) + xoffset |= oCode << 16 + + t.extraHist[lengthCodes1[uint8(xlength)]]++ + t.offHist[oCode&31]++ t.tokens[t.n] = token(matchType | xlength< 0 { xl := xlength if xl > 258 { // We need to have at least baseMatchLength left over for next loop. - xl = 258 - baseMatchLength + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } } xlength -= xl - xl -= 3 - t.nLits++ - lengthCode := lengthCodes1[uint8(xl)] & 31 + xl -= baseMatchLength + t.extraHist[lengthCodes1[uint8(xl)]]++ + t.offHist[oc&31]++ t.tokens[t.n] = token(matchType | uint32(xl)<> lengthShift) } -// The code is never more than 8 bits, but is returned as uint32 for convenience. -func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) } +// Convert length to code. +func lengthCode(len uint8) uint8 { return lengthCodes[len] } // Returns the offset code corresponding to a specific offset func offsetCode(off uint32) uint32 { diff --git a/vendor/github.com/klauspost/compress/fse/bitwriter.go b/vendor/github.com/klauspost/compress/fse/bitwriter.go index 43e463611b..e82fa3bb7b 100644 --- a/vendor/github.com/klauspost/compress/fse/bitwriter.go +++ b/vendor/github.com/klauspost/compress/fse/bitwriter.go @@ -152,12 +152,11 @@ func (b *bitWriter) flushAlign() { // close will write the alignment bit and write the final byte(s) // to the output. -func (b *bitWriter) close() error { +func (b *bitWriter) close() { // End mark b.addBits16Clean(1, 1) // flush until next byte. b.flushAlign() - return nil } // reset and continue writing by appending to out. 
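The bitWriter.close change above drops the always-nil error but keeps the framing convention of these reverse bitstreams: one set sentinel bit is appended after the payload, then the container is flushed to a byte boundary, so a decoder can recover the exact payload length from the highest set bit of the final byte. A minimal sketch of that recovery, assuming LSB-first packing as in the hunks above; usedBits is a hypothetical helper, not part of the library:

package main

import (
	"fmt"
	"math/bits"
)

// usedBits reports how many bits of the final byte carry payload once the
// sentinel 1-bit and the zero padding above it are stripped. Because close()
// always writes the sentinel, an all-zero last byte can only mean corruption.
func usedBits(lastByte byte) (int, error) {
	if lastByte == 0 {
		return 0, fmt.Errorf("corrupt stream, did not find end of stream")
	}
	// bits.Len8 returns the position of the highest set bit (1..8); that bit
	// is the sentinel, and everything below it is payload.
	return bits.Len8(lastByte) - 1, nil
}

func main() {
	// Three payload bits (101, written LSB-first), then the sentinel at bit 3:
	// 0b00001101. The decoder strips the sentinel and keeps 3 payload bits.
	n, err := usedBits(0b00001101)
	fmt.Println(n, err) // 3 <nil>
}

This is also why the readers' init functions reject a zero final byte with "corrupt stream, did not find end of stream": the sentinel guarantees at least one set bit in a valid stream.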
diff --git a/vendor/github.com/klauspost/compress/fse/compress.go b/vendor/github.com/klauspost/compress/fse/compress.go index b69237c9b8..074018d8f9 100644 --- a/vendor/github.com/klauspost/compress/fse/compress.go +++ b/vendor/github.com/klauspost/compress/fse/compress.go @@ -92,7 +92,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, tableLog uint8, first symbolTra im := int32((nbBitsOut << 16) - first.deltaNbBits) lu := (im >> nbBitsOut) + first.deltaFindState c.state = c.stateTable[lu] - return } // encode the output symbol provided and write it to the bitstream. @@ -147,54 +146,51 @@ func (s *Scratch) compress(src []byte) error { c1.encodeZero(tt[src[ip-2]]) ip -= 2 } + src = src[:ip] // Main compression loop. switch { case !s.zeroBits && s.actualTableLog <= 8: // We can encode 4 symbols without requiring a flush. // We do not need to check if any output is 0 bits. - for ip >= 4 { + for ; len(src) >= 4; src = src[:len(src)-4] { s.bw.flush32() - v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] + v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1] c2.encode(tt[v0]) c1.encode(tt[v1]) c2.encode(tt[v2]) c1.encode(tt[v3]) - ip -= 4 } case !s.zeroBits: // We do not need to check if any output is 0 bits. - for ip >= 4 { + for ; len(src) >= 4; src = src[:len(src)-4] { s.bw.flush32() - v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] + v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1] c2.encode(tt[v0]) c1.encode(tt[v1]) s.bw.flush32() c2.encode(tt[v2]) c1.encode(tt[v3]) - ip -= 4 } case s.actualTableLog <= 8: // We can encode 4 symbols without requiring a flush - for ip >= 4 { + for ; len(src) >= 4; src = src[:len(src)-4] { s.bw.flush32() - v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] + v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1] c2.encodeZero(tt[v0]) c1.encodeZero(tt[v1]) c2.encodeZero(tt[v2]) c1.encodeZero(tt[v3]) - ip -= 4 } default: - for ip >= 4 { + for ; len(src) >= 4; src = src[:len(src)-4] { s.bw.flush32() - v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] + v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1] c2.encodeZero(tt[v0]) c1.encodeZero(tt[v1]) s.bw.flush32() c2.encodeZero(tt[v2]) c1.encodeZero(tt[v3]) - ip -= 4 } } @@ -203,7 +199,8 @@ func (s *Scratch) compress(src []byte) error { c2.flush(s.actualTableLog) c1.flush(s.actualTableLog) - return s.bw.close() + s.bw.close() + return nil } // writeCount will write the normalized histogram count to header. @@ -215,7 +212,7 @@ func (s *Scratch) writeCount() error { previous0 bool charnum uint16 - maxHeaderSize = ((int(s.symbolLen) * int(tableLog)) >> 3) + 3 + maxHeaderSize = ((int(s.symbolLen)*int(tableLog) + 4 + 2) >> 3) + 3 // Write Table Size bitStream = uint32(tableLog - minTablelog) @@ -301,7 +298,7 @@ func (s *Scratch) writeCount() error { out[outP+1] = byte(bitStream >> 8) outP += (bitCount + 7) / 8 - if uint16(charnum) > s.symbolLen { + if charnum > s.symbolLen { return errors.New("internal error: charnum > s.symbolLen") } s.Out = out[:outP] @@ -331,7 +328,7 @@ type cTable struct { func (s *Scratch) allocCtable() { tableSize := 1 << s.actualTableLog // get tableSymbol that is big enough. 
- if cap(s.ct.tableSymbol) < int(tableSize) { + if cap(s.ct.tableSymbol) < tableSize { s.ct.tableSymbol = make([]byte, tableSize) } s.ct.tableSymbol = s.ct.tableSymbol[:tableSize] @@ -460,15 +457,17 @@ func (s *Scratch) countSimple(in []byte) (max int) { for _, v := range in { s.count[v]++ } - m := uint32(0) + m, symlen := uint32(0), s.symbolLen for i, v := range s.count[:] { + if v == 0 { + continue + } if v > m { m = v } - if v > 0 { - s.symbolLen = uint16(i) + 1 - } + symlen = uint16(i) + 1 } + s.symbolLen = symlen return int(m) } @@ -565,8 +564,8 @@ func (s *Scratch) normalizeCount2() error { distributed uint32 total = uint32(s.br.remain()) tableLog = s.actualTableLog - lowThreshold = uint32(total >> tableLog) - lowOne = uint32((total * 3) >> (tableLog + 1)) + lowThreshold = total >> tableLog + lowOne = (total * 3) >> (tableLog + 1) ) for i, cnt := range s.count[:s.symbolLen] { if cnt == 0 { @@ -591,7 +590,7 @@ func (s *Scratch) normalizeCount2() error { if (total / toDistribute) > lowOne { // risk of rounding to zero - lowOne = uint32((total * 3) / (toDistribute * 2)) + lowOne = (total * 3) / (toDistribute * 2) for i, cnt := range s.count[:s.symbolLen] { if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) { s.norm[i] = 1 diff --git a/vendor/github.com/klauspost/compress/fse/decompress.go b/vendor/github.com/klauspost/compress/fse/decompress.go index 413ec3b3cd..0c7dd4ffef 100644 --- a/vendor/github.com/klauspost/compress/fse/decompress.go +++ b/vendor/github.com/klauspost/compress/fse/decompress.go @@ -15,7 +15,7 @@ const ( // It is possible, but by no way guaranteed that corrupt data will // return an error. // It is up to the caller to verify integrity of the returned data. -// Use a predefined Scrach to set maximum acceptable output size. +// Use a predefined Scratch to set maximum acceptable output size. func Decompress(b []byte, s *Scratch) ([]byte, error) { s, err := s.prepare(b) if err != nil { @@ -172,7 +172,7 @@ type decSymbol struct { // allocDtable will allocate decoding tables if they are not big enough. func (s *Scratch) allocDtable() { tableSize := 1 << s.actualTableLog - if cap(s.decTable) < int(tableSize) { + if cap(s.decTable) < tableSize { s.decTable = make([]decSymbol, tableSize) } s.decTable = s.decTable[:tableSize] @@ -260,7 +260,9 @@ func (s *Scratch) buildDtable() error { // If the buffer is over-read an error is returned. func (s *Scratch) decompress() error { br := &s.bits - br.init(s.br.unread()) + if err := br.init(s.br.unread()); err != nil { + return err + } var s1, s2 decoder // Initialize and decode first state and symbol. @@ -340,7 +342,7 @@ type decoder struct { func (d *decoder) init(in *bitReader, dt []decSymbol, tableLog uint8) { d.dt = dt d.br = in - d.state = uint16(in.getBits(tableLog)) + d.state = in.getBits(tableLog) } // next returns the next symbol and sets the next state. diff --git a/vendor/github.com/klauspost/compress/gen.sh b/vendor/github.com/klauspost/compress/gen.sh new file mode 100644 index 0000000000..aff942205f --- /dev/null +++ b/vendor/github.com/klauspost/compress/gen.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cd s2/cmd/_s2sx/ || exit 1 +go generate . 
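Before the gunzip changes, note the pattern the fse compress loops above switch to: instead of decrementing an ip index, the source is re-sliced from the tail (src = src[:len(src)-4]) so every src[len(src)-k] access is provably in range once len(src) >= 4 holds, letting the compiler drop per-load bounds checks. A minimal sketch of the idiom on a toy reducer, assuming nothing about the encoder itself; sum4 is a made-up stand-in for the interleaved encode calls:

package main

import "fmt"

// sum4 consumes src four bytes at a time from the tail, mirroring the
// slice-shrinking loop structure of the rewritten compress hot paths.
func sum4(src []byte) (sum int) {
	for ; len(src) >= 4; src = src[:len(src)-4] {
		// All four loads are provably in bounds here: len(src) >= 4.
		v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
		sum += int(v0) + int(v1) + int(v2) + int(v3)
	}
	// Simple tail loop for the remainder; the real encoder instead trims the
	// odd leading symbols up front so the main loop sees a multiple of four.
	for _, v := range src {
		sum += int(v)
	}
	return sum
}

func main() {
	fmt.Println(sum4([]byte{1, 2, 3, 4, 5, 6})) // 21
}

The direction matters: FSE encodes symbols back-to-front, so shrinking the slice from the tail preserves the original ip-decrementing iteration order.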
diff --git a/vendor/github.com/klauspost/compress/gzip/gunzip.go b/vendor/github.com/klauspost/compress/gzip/gunzip.go index 568b5d4fb8..00a0a2c386 100644 --- a/vendor/github.com/klauspost/compress/gzip/gunzip.go +++ b/vendor/github.com/klauspost/compress/gzip/gunzip.go @@ -8,8 +8,8 @@ package gzip import ( "bufio" + "compress/gzip" "encoding/binary" - "errors" "hash/crc32" "io" "time" @@ -30,9 +30,9 @@ const ( var ( // ErrChecksum is returned when reading GZIP data that has an invalid checksum. - ErrChecksum = errors.New("gzip: invalid checksum") + ErrChecksum = gzip.ErrChecksum // ErrHeader is returned when reading GZIP data that has an invalid header. - ErrHeader = errors.New("gzip: invalid header") + ErrHeader = gzip.ErrHeader ) var le = binary.LittleEndian @@ -75,6 +75,7 @@ type Header struct { type Reader struct { Header // valid after NewReader or Reader.Reset r flate.Reader + br *bufio.Reader decompressor io.ReadCloser digest uint32 // CRC-32, IEEE polynomial (section 8) size uint32 // Uncompressed size (section 2.3.1) @@ -105,11 +106,18 @@ func (z *Reader) Reset(r io.Reader) error { *z = Reader{ decompressor: z.decompressor, multistream: true, + br: z.br, } if rr, ok := r.(flate.Reader); ok { z.r = rr } else { - z.r = bufio.NewReader(r) + // Reuse if we can. + if z.br != nil { + z.br.Reset(r) + } else { + z.br = bufio.NewReader(r) + } + z.r = z.br } z.Header, z.err = z.readHeader() return z.err @@ -230,6 +238,11 @@ func (z *Reader) readHeader() (hdr Header, err error) { } } + // Reserved FLG bits must be zero. + if flg>>5 != 0 { + return hdr, ErrHeader + } + z.digest = 0 if z.decompressor == nil { z.decompressor = flate.NewReader(z.r) @@ -245,48 +258,71 @@ func (z *Reader) Read(p []byte) (n int, err error) { return 0, z.err } - n, z.err = z.decompressor.Read(p) - z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n]) - z.size += uint32(n) - if z.err != io.EOF { - // In the normal case we return here. - return n, z.err - } + for n == 0 { + n, z.err = z.decompressor.Read(p) + z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n]) + z.size += uint32(n) + if z.err != io.EOF { + // In the normal case we return here. + return n, z.err + } - // Finished file; check checksum and size. - if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil { - z.err = noEOF(err) - return n, z.err - } - digest := le.Uint32(z.buf[:4]) - size := le.Uint32(z.buf[4:8]) - if digest != z.digest || size != z.size { - z.err = ErrChecksum - return n, z.err - } - z.digest, z.size = 0, 0 + // Finished file; check checksum and size. + if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil { + z.err = noEOF(err) + return n, z.err + } + digest := le.Uint32(z.buf[:4]) + size := le.Uint32(z.buf[4:8]) + if digest != z.digest || size != z.size { + z.err = ErrChecksum + return n, z.err + } + z.digest, z.size = 0, 0 - // File is ok; check if there is another. - if !z.multistream { - return n, io.EOF - } - z.err = nil // Remove io.EOF + // File is ok; check if there is another. + if !z.multistream { + return n, io.EOF + } + z.err = nil // Remove io.EOF - if _, z.err = z.readHeader(); z.err != nil { - return n, z.err + if _, z.err = z.readHeader(); z.err != nil { + return n, z.err + } } - // Read from next file, if necessary. - if n > 0 { - return n, nil - } - return z.Read(p) + return n, nil } -// Support the io.WriteTo interface for io.Copy and friends. 
+type crcer interface { + io.Writer + Sum32() uint32 + Reset() +} +type crcUpdater struct { + z *Reader +} + +func (c *crcUpdater) Write(p []byte) (int, error) { + c.z.digest = crc32.Update(c.z.digest, crc32.IEEETable, p) + return len(p), nil +} + +func (c *crcUpdater) Sum32() uint32 { + return c.z.digest +} + +func (c *crcUpdater) Reset() { + c.z.digest = 0 +} + +// WriteTo support the io.WriteTo interface for io.Copy and friends. func (z *Reader) WriteTo(w io.Writer) (int64, error) { total := int64(0) - crcWriter := crc32.NewIEEE() + crcWriter := crcer(crc32.NewIEEE()) + if z.digest != 0 { + crcWriter = &crcUpdater{z: z} + } for { if z.err != nil { if z.err == io.EOF { diff --git a/vendor/github.com/klauspost/compress/gzip/gzip.go b/vendor/github.com/klauspost/compress/gzip/gzip.go index 26203851bd..5bc720593e 100644 --- a/vendor/github.com/klauspost/compress/gzip/gzip.go +++ b/vendor/github.com/klauspost/compress/gzip/gzip.go @@ -74,6 +74,27 @@ func NewWriterLevel(w io.Writer, level int) (*Writer, error) { return z, nil } +// MinCustomWindowSize is the minimum window size that can be sent to NewWriterWindow. +const MinCustomWindowSize = flate.MinCustomWindowSize + +// MaxCustomWindowSize is the maximum custom window that can be sent to NewWriterWindow. +const MaxCustomWindowSize = flate.MaxCustomWindowSize + +// NewWriterWindow returns a new Writer compressing data with a custom window size. +// windowSize must be from MinCustomWindowSize to MaxCustomWindowSize. +func NewWriterWindow(w io.Writer, windowSize int) (*Writer, error) { + if windowSize < MinCustomWindowSize { + return nil, errors.New("gzip: requested window size less than MinWindowSize") + } + if windowSize > MaxCustomWindowSize { + return nil, errors.New("gzip: requested window size bigger than MaxCustomWindowSize") + } + + z := new(Writer) + z.init(w, -windowSize) + return z, nil +} + func (z *Writer) init(w io.Writer, level int) { compressor := z.compressor if level != StatelessCompression { diff --git a/vendor/github.com/klauspost/compress/huff0/README.md b/vendor/github.com/klauspost/compress/huff0/README.md index e12da4db2f..8b6e5c6638 100644 --- a/vendor/github.com/klauspost/compress/huff0/README.md +++ b/vendor/github.com/klauspost/compress/huff0/README.md @@ -14,7 +14,9 @@ but it can be used as a secondary step to compressors (like Snappy) that does no ## News - * Mar 2018: First implementation released. Consider this beta software for now. +This is used as part of the [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression package. + +This ensures that most functionality is well tested. # Usage diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go index a4979e8868..e36d9742f9 100644 --- a/vendor/github.com/klauspost/compress/huff0/bitreader.go +++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go @@ -8,115 +8,10 @@ package huff0 import ( "encoding/binary" "errors" + "fmt" "io" ) -// bitReader reads a bitstream in reverse. -// The last set bit indicates the start of the stream and is used -// for aligning the input. -type bitReader struct { - in []byte - off uint // next byte to read is at in[off - 1] - value uint64 - bitsRead uint8 -} - -// init initializes and resets the bit reader. 
-func (b *bitReader) init(in []byte) error { - if len(in) < 1 { - return errors.New("corrupt stream: too short") - } - b.in = in - b.off = uint(len(in)) - // The highest bit of the last byte indicates where to start - v := in[len(in)-1] - if v == 0 { - return errors.New("corrupt stream, did not find end of stream") - } - b.bitsRead = 64 - b.value = 0 - if len(in) >= 8 { - b.fillFastStart() - } else { - b.fill() - b.fill() - } - b.bitsRead += 8 - uint8(highBit32(uint32(v))) - return nil -} - -// peekBitsFast requires that at least one bit is requested every time. -// There are no checks if the buffer is filled. -func (b *bitReader) peekBitsFast(n uint8) uint16 { - const regMask = 64 - 1 - v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask)) - return v -} - -// fillFast() will make sure at least 32 bits are available. -// There must be at least 4 bytes available. -func (b *bitReader) fillFast() { - if b.bitsRead < 32 { - return - } - - // 2 bounds checks. - v := b.in[b.off-4 : b.off] - v = v[:4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - b.value = (b.value << 32) | uint64(low) - b.bitsRead -= 32 - b.off -= 4 -} - -func (b *bitReader) advance(n uint8) { - b.bitsRead += n -} - -// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read. -func (b *bitReader) fillFastStart() { - // Do single re-slice to avoid bounds checks. - b.value = binary.LittleEndian.Uint64(b.in[b.off-8:]) - b.bitsRead = 0 - b.off -= 8 -} - -// fill() will make sure at least 32 bits are available. -func (b *bitReader) fill() { - if b.bitsRead < 32 { - return - } - if b.off > 4 { - v := b.in[b.off-4:] - v = v[:4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - b.value = (b.value << 32) | uint64(low) - b.bitsRead -= 32 - b.off -= 4 - return - } - for b.off > 0 { - b.value = (b.value << 8) | uint64(b.in[b.off-1]) - b.bitsRead -= 8 - b.off-- - } -} - -// finished returns true if all bits have been read from the bit stream. -func (b *bitReader) finished() bool { - return b.off == 0 && b.bitsRead >= 64 -} - -// close the bitstream and returns an error if out-of-buffer reads occurred. -func (b *bitReader) close() error { - // Release reference. - b.in = nil - if b.bitsRead > 64 { - return io.ErrUnexpectedEOF - } - return nil -} - // bitReader reads a bitstream in reverse. // The last set bit indicates the start of the stream and is used // for aligning the input. @@ -172,7 +67,6 @@ func (b *bitReaderBytes) fillFast() { // 2 bounds checks. v := b.in[b.off-4 : b.off] - v = v[:4] low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) b.value |= uint64(low) << (b.bitsRead - 32) b.bitsRead -= 32 @@ -193,8 +87,7 @@ func (b *bitReaderBytes) fill() { return } if b.off > 4 { - v := b.in[b.off-4:] - v = v[:4] + v := b.in[b.off-4 : b.off] low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) b.value |= uint64(low) << (b.bitsRead - 32) b.bitsRead -= 32 @@ -213,10 +106,17 @@ func (b *bitReaderBytes) finished() bool { return b.off == 0 && b.bitsRead >= 64 } +func (b *bitReaderBytes) remaining() uint { + return b.off*8 + uint(64-b.bitsRead) +} + // close the bitstream and returns an error if out-of-buffer reads occurred. func (b *bitReaderBytes) close() error { // Release reference. 
b.in = nil + if b.remaining() > 0 { + return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining()) + } if b.bitsRead > 64 { return io.ErrUnexpectedEOF } @@ -277,7 +177,6 @@ func (b *bitReaderShifted) fillFast() { // 2 bounds checks. v := b.in[b.off-4 : b.off] - v = v[:4] low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) b.value |= uint64(low) << ((b.bitsRead - 32) & 63) b.bitsRead -= 32 @@ -298,8 +197,7 @@ func (b *bitReaderShifted) fill() { return } if b.off > 4 { - v := b.in[b.off-4:] - v = v[:4] + v := b.in[b.off-4 : b.off] low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) b.value |= uint64(low) << ((b.bitsRead - 32) & 63) b.bitsRead -= 32 @@ -313,15 +211,17 @@ func (b *bitReaderShifted) fill() { } } -// finished returns true if all bits have been read from the bit stream. -func (b *bitReaderShifted) finished() bool { - return b.off == 0 && b.bitsRead >= 64 +func (b *bitReaderShifted) remaining() uint { + return b.off*8 + uint(64-b.bitsRead) } // close the bitstream and returns an error if out-of-buffer reads occurred. func (b *bitReaderShifted) close() error { // Release reference. b.in = nil + if b.remaining() > 0 { + return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining()) + } if b.bitsRead > 64 { return io.ErrUnexpectedEOF } diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go index 6bce4e87d4..0ebc9aaac7 100644 --- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go +++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go @@ -5,8 +5,6 @@ package huff0 -import "fmt" - // bitWriter will write bits. // First bit will be LSB of the first byte of output. type bitWriter struct { @@ -15,22 +13,6 @@ type bitWriter struct { out []byte } -// bitMask16 is bitmasks. Has extra to avoid bounds check. -var bitMask16 = [32]uint16{ - 0, 1, 3, 7, 0xF, 0x1F, - 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, - 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF, - 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, - 0xFFFF, 0xFFFF} /* up to 16 bits */ - -// addBits16NC will add up to 16 bits. -// It will not check if there is space for them, -// so the caller must ensure that it has flushed recently. -func (b *bitWriter) addBits16NC(value uint16, bits uint8) { - b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63) - b.nBits += bits -} - // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated. // It will not check if there is space for them, so the caller must ensure that it has flushed recently. func (b *bitWriter) addBits16Clean(value uint16, bits uint8) { @@ -70,102 +52,20 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) { b.nBits += encA.nBits + encB.nBits } -// addBits16ZeroNC will add up to 16 bits. +// encFourSymbols adds up to 32 bits from four symbols. // It will not check if there is space for them, -// so the caller must ensure that it has flushed recently. -// This is fastest if bits can be zero. -func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) { - if bits == 0 { - return - } - value <<= (16 - bits) & 15 - value >>= (16 - bits) & 15 - b.bitContainer |= uint64(value) << (b.nBits & 63) - b.nBits += bits -} - -// flush will flush all pending full bytes. -// There will be at least 56 bits available for writing when this has been called. -// Using flush32 is faster, but leaves less space for writing. 
-func (b *bitWriter) flush() { - v := b.nBits >> 3 - switch v { - case 0: - return - case 1: - b.out = append(b.out, - byte(b.bitContainer), - ) - b.bitContainer >>= 1 << 3 - case 2: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - ) - b.bitContainer >>= 2 << 3 - case 3: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - ) - b.bitContainer >>= 3 << 3 - case 4: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - ) - b.bitContainer >>= 4 << 3 - case 5: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - ) - b.bitContainer >>= 5 << 3 - case 6: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - ) - b.bitContainer >>= 6 << 3 - case 7: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - byte(b.bitContainer>>48), - ) - b.bitContainer >>= 7 << 3 - case 8: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - byte(b.bitContainer>>48), - byte(b.bitContainer>>56), - ) - b.bitContainer = 0 - b.nBits = 0 - return - default: - panic(fmt.Errorf("bits (%d) > 64", b.nBits)) - } - b.nBits &= 7 +// so the caller must ensure that b has been flushed recently. +func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) { + bitsA := encA.nBits + bitsB := bitsA + encB.nBits + bitsC := bitsB + encC.nBits + bitsD := bitsC + encD.nBits + combined := uint64(encA.val) | + (uint64(encB.val) << (bitsA & 63)) | + (uint64(encC.val) << (bitsB & 63)) | + (uint64(encD.val) << (bitsC & 63)) + b.bitContainer |= combined << (b.nBits & 63) + b.nBits += bitsD } // flush32 will flush out, so there are at least 32 bits available for writing. @@ -194,17 +94,9 @@ func (b *bitWriter) flushAlign() { // close will write the alignment bit and write the final byte(s) // to the output. -func (b *bitWriter) close() error { +func (b *bitWriter) close() { // End mark b.addBits16Clean(1, 1) // flush until next byte. b.flushAlign() - return nil -} - -// reset and continue writing by appending to out. -func (b *bitWriter) reset(out []byte) { - b.bitContainer = 0 - b.nBits = 0 - b.out = out } diff --git a/vendor/github.com/klauspost/compress/huff0/bytereader.go b/vendor/github.com/klauspost/compress/huff0/bytereader.go deleted file mode 100644 index 50bcdf6ea9..0000000000 --- a/vendor/github.com/klauspost/compress/huff0/bytereader.go +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2018 Klaus Post. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. -// Based on work Copyright (c) 2013, Yann Collet, released under BSD License. - -package huff0 - -// byteReader provides a byte reader that reads -// little endian values from a byte stream. -// The input stream is manually advanced. -// The reader performs no bounds checks. -type byteReader struct { - b []byte - off int -} - -// init will initialize the reader and set the input. 
-func (b *byteReader) init(in []byte) { - b.b = in - b.off = 0 -} - -// advance the stream b n bytes. -func (b *byteReader) advance(n uint) { - b.off += int(n) -} - -// Int32 returns a little endian int32 starting at current offset. -func (b byteReader) Int32() int32 { - v3 := int32(b.b[b.off+3]) - v2 := int32(b.b[b.off+2]) - v1 := int32(b.b[b.off+1]) - v0 := int32(b.b[b.off]) - return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0 -} - -// Uint32 returns a little endian uint32 starting at current offset. -func (b byteReader) Uint32() uint32 { - v3 := uint32(b.b[b.off+3]) - v2 := uint32(b.b[b.off+2]) - v1 := uint32(b.b[b.off+1]) - v0 := uint32(b.b[b.off]) - return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0 -} - -// unread returns the unread portion of the input. -func (b byteReader) unread() []byte { - return b.b[b.off:] -} - -// remain will return the number of bytes remaining. -func (b byteReader) remain() int { - return len(b.b) - b.off -} diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go index f9ed5f8306..84aa3d12f0 100644 --- a/vendor/github.com/klauspost/compress/huff0/compress.go +++ b/vendor/github.com/klauspost/compress/huff0/compress.go @@ -2,6 +2,7 @@ package huff0 import ( "fmt" + "math" "runtime" "sync" ) @@ -161,11 +162,75 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error) return s.Out, false, nil } +// EstimateSizes will estimate the data sizes +func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err error) { + s, err = s.prepare(in) + if err != nil { + return 0, 0, 0, err + } + + // Create histogram, if none was provided. + tableSz, dataSz, reuseSz = -1, -1, -1 + maxCount := s.maxCount + var canReuse = false + if maxCount == 0 { + maxCount, canReuse = s.countSimple(in) + } else { + canReuse = s.canUseTable(s.prevTable) + } + + // We want the output size to be less than this: + wantSize := len(in) + if s.WantLogLess > 0 { + wantSize -= wantSize >> s.WantLogLess + } + + // Reset for next run. + s.clearCount = true + s.maxCount = 0 + if maxCount >= len(in) { + if maxCount > len(in) { + return 0, 0, 0, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in)) + } + if len(in) == 1 { + return 0, 0, 0, ErrIncompressible + } + // One symbol, use RLE + return 0, 0, 0, ErrUseRLE + } + if maxCount == 1 || maxCount < (len(in)>>7) { + // Each symbol present maximum once or too well distributed. + return 0, 0, 0, ErrIncompressible + } + + // Calculate new table. + err = s.buildCTable() + if err != nil { + return 0, 0, 0, err + } + + if false && !s.canUseTable(s.cTable) { + panic("invalid table generated") + } + + tableSz, err = s.cTable.estTableSize(s) + if err != nil { + return 0, 0, 0, err + } + if canReuse { + reuseSz = s.prevTable.estimateSize(s.count[:s.symbolLen]) + } + dataSz = s.cTable.estimateSize(s.count[:s.symbolLen]) + + // Restore + return tableSz, dataSz, reuseSz, nil +} + func (s *Scratch) compress1X(src []byte) ([]byte, error) { - return s.compress1xDo(s.Out, src) + return s.compress1xDo(s.Out, src), nil } -func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) { +func (s *Scratch) compress1xDo(dst, src []byte) []byte { var bw = bitWriter{out: dst} // N is length divisible by 4. 
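// Caller-side usage sketch for the EstimateSizes helper added above
// (hedged: `input` and the reused Scratch `s` are placeholders):
//
//	tableSz, dataSz, reuseSz, err := huff0.EstimateSizes(input, s)
//	if err == nil && reuseSz >= 0 && reuseSz < tableSz+dataSz {
//		// Reusing the previous table is estimated cheaper than
//		// emitting a new one.
//	}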
@@ -183,8 +248,7 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) { tmp := src[n : n+4] // tmp should be len 4 bw.flush32() - bw.encTwoSymbols(cTable, tmp[3], tmp[2]) - bw.encTwoSymbols(cTable, tmp[1], tmp[0]) + bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]]) } } else { for ; n >= 0; n -= 4 { @@ -196,8 +260,8 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) { bw.encTwoSymbols(cTable, tmp[1], tmp[0]) } } - err := bw.close() - return bw.out, err + bw.close() + return bw.out } var sixZeros [6]byte @@ -219,11 +283,11 @@ func (s *Scratch) compress4X(src []byte) ([]byte, error) { } src = src[len(toDo):] - var err error idx := len(s.Out) - s.Out, err = s.compress1xDo(s.Out, toDo) - if err != nil { - return nil, err + s.Out = s.compress1xDo(s.Out, toDo) + if len(s.Out)-idx > math.MaxUint16 { + // We cannot store the size in the jump table + return nil, ErrIncompressible } // Write compressed length as little endian before block. if i < 3 { @@ -247,7 +311,6 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) { segmentSize := (len(src) + 3) / 4 var wg sync.WaitGroup - var errs [4]error wg.Add(4) for i := 0; i < 4; i++ { toDo := src @@ -258,16 +321,17 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) { // Separate goroutine for each block. go func(i int) { - s.tmpOut[i], errs[i] = s.compress1xDo(s.tmpOut[i][:0], toDo) + s.tmpOut[i] = s.compress1xDo(s.tmpOut[i][:0], toDo) wg.Done() }(i) } wg.Wait() for i := 0; i < 4; i++ { - if errs[i] != nil { - return nil, errs[i] - } o := s.tmpOut[i] + if len(o) > math.MaxUint16 { + // We cannot store the size in the jump table + return nil, ErrIncompressible + } // Write compressed length as little endian before block. if i < 3 { // Last length is not written. @@ -286,35 +350,36 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) { // Does not update s.clearCount. func (s *Scratch) countSimple(in []byte) (max int, reuse bool) { reuse = true + _ = s.count // Assert that s != nil to speed up the following loop. for _, v := range in { s.count[v]++ } m := uint32(0) if len(s.prevTable) > 0 { for i, v := range s.count[:] { + if v == 0 { + continue + } if v > m { m = v } - if v > 0 { - s.symbolLen = uint16(i) + 1 - if i >= len(s.prevTable) { - reuse = false - } else { - if s.prevTable[i].nBits == 0 { - reuse = false - } - } + s.symbolLen = uint16(i) + 1 + if i >= len(s.prevTable) { + reuse = false + } else if s.prevTable[i].nBits == 0 { + reuse = false } } return int(m), reuse } for i, v := range s.count[:] { + if v == 0 { + continue + } if v > m { m = v } - if v > 0 { - s.symbolLen = uint16(i) + 1 - } + s.symbolLen = uint16(i) + 1 } return int(m), false } @@ -331,6 +396,7 @@ func (s *Scratch) canUseTable(c cTable) bool { return true } +//lint:ignore U1000 used for debugging func (s *Scratch) validateTable(c cTable) bool { if len(c) < int(s.symbolLen) { return false @@ -350,7 +416,7 @@ func (s *Scratch) validateTable(c cTable) bool { // minTableLog provides the minimum logSize to safely represent a distribution. 
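// Aside (editorial note): compress4X above now rejects any sub-stream longer
// than math.MaxUint16 because the 4X block header stores the first three
// stream lengths as little-endian uint16s (the fourth is implicit). Reading
// that 6-byte jump table, as the decoder later in this diff does:
//
//	l0 := int(src[0]) | int(src[1])<<8
//	l1 := int(src[2]) | int(src[3])<<8
//	l2 := int(src[4]) | int(src[5])<<8
//	start3 := 6 + l0 + l1 + l2 // fourth stream begins here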
func (s *Scratch) minTableLog() uint8 { - minBitsSrc := highBit32(uint32(s.br.remain())) + 1 + minBitsSrc := highBit32(uint32(s.srcLen)) + 1 minBitsSymbols := highBit32(uint32(s.symbolLen-1)) + 2 if minBitsSrc < minBitsSymbols { return uint8(minBitsSrc) @@ -362,7 +428,7 @@ func (s *Scratch) minTableLog() uint8 { func (s *Scratch) optimalTableLog() { tableLog := s.TableLog minBits := s.minTableLog() - maxBitsSrc := uint8(highBit32(uint32(s.br.remain()-1))) - 1 + maxBitsSrc := uint8(highBit32(uint32(s.srcLen-1))) - 1 if maxBitsSrc < tableLog { // Accuracy can be reduced tableLog = maxBitsSrc @@ -403,41 +469,42 @@ func (s *Scratch) buildCTable() error { var startNode = int16(s.symbolLen) nonNullRank := s.symbolLen - 1 - nodeNb := int16(startNode) + nodeNb := startNode huffNode := s.nodes[1 : huffNodesLen+1] // This overlays the slice above, but allows "-1" index lookups. // Different from reference implementation. huffNode0 := s.nodes[0 : huffNodesLen+1] - for huffNode[nonNullRank].count == 0 { + for huffNode[nonNullRank].count() == 0 { nonNullRank-- } lowS := int16(nonNullRank) nodeRoot := nodeNb + lowS - 1 lowN := nodeNb - huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count - huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb) + huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count()) + huffNode[lowS].setParent(nodeNb) + huffNode[lowS-1].setParent(nodeNb) nodeNb++ lowS -= 2 for n := nodeNb; n <= nodeRoot; n++ { - huffNode[n].count = 1 << 30 + huffNode[n].setCount(1 << 30) } // fake entry, strong barrier - huffNode0[0].count = 1 << 31 + huffNode0[0].setCount(1 << 31) // create parents for nodeNb <= nodeRoot { var n1, n2 int16 - if huffNode0[lowS+1].count < huffNode0[lowN+1].count { + if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() { n1 = lowS lowS-- } else { n1 = lowN lowN++ } - if huffNode0[lowS+1].count < huffNode0[lowN+1].count { + if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() { n2 = lowS lowS-- } else { @@ -445,18 +512,19 @@ func (s *Scratch) buildCTable() error { lowN++ } - huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count - huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb) + huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count()) + huffNode0[n1+1].setParent(nodeNb) + huffNode0[n2+1].setParent(nodeNb) nodeNb++ } // distribute weights (unlimited tree height) - huffNode[nodeRoot].nbBits = 0 + huffNode[nodeRoot].setNbBits(0) for n := nodeRoot - 1; n >= startNode; n-- { - huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1 + huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1) } for n := uint16(0); n <= nonNullRank; n++ { - huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1 + huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1) } s.actualTableLog = s.setMaxHeight(int(nonNullRank)) maxNbBits := s.actualTableLog @@ -468,7 +536,7 @@ func (s *Scratch) buildCTable() error { var nbPerRank [tableLogMax + 1]uint16 var valPerRank [16]uint16 for _, v := range huffNode[:nonNullRank+1] { - nbPerRank[v.nbBits]++ + nbPerRank[v.nbBits()]++ } // determine stating value per rank { @@ -483,7 +551,7 @@ func (s *Scratch) buildCTable() error { // push nbBits per symbol, symbol order for _, v := range huffNode[:nonNullRank+1] { - s.cTable[v.symbol].nBits = v.nbBits + s.cTable[v.symbol()].nBits = v.nbBits() } // assign value within rank, symbol order @@ -529,14 +597,13 @@ func (s *Scratch) huffSort() { pos := 
rank[r].current rank[r].current++ prev := nodes[(pos-1)&huffNodesMask] - for pos > rank[r].base && c > prev.count { + for pos > rank[r].base && c > prev.count() { nodes[pos&huffNodesMask] = prev pos-- prev = nodes[(pos-1)&huffNodesMask] } - nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)} + nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n)) } - return } func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { @@ -544,7 +611,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { huffNode := s.nodes[1 : huffNodesLen+1] //huffNode = huffNode[: huffNodesLen] - largestBits := huffNode[lastNonNull].nbBits + largestBits := huffNode[lastNonNull].nbBits() // early exit : no elt > maxNbBits if largestBits <= maxNbBits { @@ -554,14 +621,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { baseCost := int(1) << (largestBits - maxNbBits) n := uint32(lastNonNull) - for huffNode[n].nbBits > maxNbBits { - totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)) - huffNode[n].nbBits = maxNbBits + for huffNode[n].nbBits() > maxNbBits { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits())) + huffNode[n].setNbBits(maxNbBits) n-- } // n stops at huffNode[n].nbBits <= maxNbBits - for huffNode[n].nbBits == maxNbBits { + for huffNode[n].nbBits() == maxNbBits { n-- } // n end at index of smallest symbol using < maxNbBits @@ -580,12 +647,12 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { // Get pos of last (smallest) symbol per rank { - currentNbBits := uint8(maxNbBits) + currentNbBits := maxNbBits for pos := int(n); pos >= 0; pos-- { - if huffNode[pos].nbBits >= currentNbBits { + if huffNode[pos].nbBits() >= currentNbBits { continue } - currentNbBits = huffNode[pos].nbBits // < maxNbBits + currentNbBits = huffNode[pos].nbBits() // < maxNbBits rankLast[maxNbBits-currentNbBits] = uint32(pos) } } @@ -602,8 +669,8 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { if lowPos == noSymbol { break } - highTotal := huffNode[highPos].count - lowTotal := 2 * huffNode[lowPos].count + highTotal := huffNode[highPos].count() + lowTotal := 2 * huffNode[lowPos].count() if highTotal <= lowTotal { break } @@ -619,13 +686,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { // this rank is no longer empty rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease] } - huffNode[rankLast[nBitsToDecrease]].nbBits++ + huffNode[rankLast[nBitsToDecrease]].setNbBits(1 + + huffNode[rankLast[nBitsToDecrease]].nbBits()) if rankLast[nBitsToDecrease] == 0 { /* special case, reached largest symbol */ rankLast[nBitsToDecrease] = noSymbol } else { rankLast[nBitsToDecrease]-- - if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease { + if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease { rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */ } } @@ -633,15 +701,15 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { for totalCost < 0 { /* Sometimes, cost correction overshoot */ if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */ - for huffNode[n].nbBits == maxNbBits { + for huffNode[n].nbBits() == maxNbBits { n-- } - huffNode[n+1].nbBits-- + huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1) rankLast[1] = n + 1 totalCost++ continue } - huffNode[rankLast[1]+1].nbBits-- + huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1) rankLast[1]++ totalCost++ } @@ -649,9 +717,26 @@ func (s *Scratch) 
setMaxHeight(lastNonNull int) uint8 { return maxNbBits } -type nodeElt struct { - count uint32 - parent uint16 - symbol byte - nbBits uint8 +// A nodeElt is the fields +// +// count uint32 +// parent uint16 +// symbol byte +// nbBits uint8 +// +// in some order, all squashed into an integer so that the compiler +// always loads and stores entire nodeElts instead of separate fields. +type nodeElt uint64 + +func makeNodeElt(count uint32, symbol byte) nodeElt { + return nodeElt(count) | nodeElt(symbol)<<48 } + +func (e *nodeElt) count() uint32 { return uint32(*e) } +func (e *nodeElt) parent() uint16 { return uint16(*e >> 32) } +func (e *nodeElt) symbol() byte { return byte(*e >> 48) } +func (e *nodeElt) nbBits() uint8 { return uint8(*e >> 56) } + +func (e *nodeElt) setCount(c uint32) { *e = (*e)&0xffffffff00000000 | nodeElt(c) } +func (e *nodeElt) setParent(p int16) { *e = (*e)&0xffff0000ffffffff | nodeElt(uint16(p))<<32 } +func (e *nodeElt) setNbBits(n uint8) { *e = (*e)&0x00ffffffffffffff | nodeElt(n)<<56 } diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go index 41703bba4d..0f56b02d74 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress.go +++ b/vendor/github.com/klauspost/compress/huff0/decompress.go @@ -4,13 +4,13 @@ import ( "errors" "fmt" "io" + "sync" "github.com/klauspost/compress/fse" ) type dTable struct { single []dEntrySingle - double []dEntryDouble } // single-symbols decoding @@ -18,13 +18,6 @@ type dEntrySingle struct { entry uint16 } -// double-symbols decoding -type dEntryDouble struct { - seq uint16 - nBits uint8 - len uint8 -} - // Uses special code for all tables that are < 8 bits. const use8BitTables = true @@ -34,7 +27,7 @@ const use8BitTables = true // If no Scratch is provided a new one is allocated. // The returned Scratch can be used for encoding or decoding input using this table. func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) { - s, err = s.prepare(in) + s, err = s.prepare(nil) if err != nil { return s, nil, err } @@ -68,7 +61,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) { b, err := fse.Decompress(in[:iSize], s.fse) s.fse.Out = nil if err != nil { - return s, nil, err + return s, nil, fmt.Errorf("fse decompress returned: %w", err) } if len(b) > 255 { return s, nil, errors.New("corrupt input: output table too large") @@ -216,6 +209,7 @@ func (s *Scratch) Decoder() *Decoder { return &Decoder{ dt: s.dt, actualTableLog: s.actualTableLog, + bufs: &s.decPool, } } @@ -223,103 +217,15 @@ func (s *Scratch) Decoder() *Decoder { type Decoder struct { dt dTable actualTableLog uint8 + bufs *sync.Pool } -// Decompress1X will decompress a 1X encoded stream. -// The cap of the output buffer will be the maximum decompressed size. -// The length of the supplied input must match the end of a block exactly. -func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { - if len(d.dt.single) == 0 { - return nil, errors.New("no table loaded") - } - if use8BitTables && d.actualTableLog <= 8 { - return d.decompress1X8Bit(dst, src) - } - var br bitReaderShifted - err := br.init(src) - if err != nil { - return dst, err - } - maxDecodedSize := cap(dst) - dst = dst[:0] - - // Avoid bounds check by always having full sized table. - const tlSize = 1 << tableLogMax - const tlMask = tlSize - 1 - dt := d.dt.single[:tlSize] - - // Use temp table to avoid bound checks/append penalty. 
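// Aside (editorial note): the portable Decompress1X deleted in this hunk is
// re-added near the end of this diff in decompress_amd64.go, where an
// assembly main loop replaces the Go loop and only the tail decode survives
// in Go. The compress.go hunk above also packs nodeElt into a single uint64
// so elements load and store as one word; layout from the LSB:
//
//	| count:32 | parent:16 | symbol:8 | nbBits:8 |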
- var buf [256]byte - var off uint8 - - for br.off >= 8 { - br.fillFast() - v := dt[br.peekBitsFast(d.actualTableLog)&tlMask] - br.advance(uint8(v.entry)) - buf[off+0] = uint8(v.entry >> 8) - - v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] - br.advance(uint8(v.entry)) - buf[off+1] = uint8(v.entry >> 8) - - // Refill - br.fillFast() - - v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] - br.advance(uint8(v.entry)) - buf[off+2] = uint8(v.entry >> 8) - - v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] - br.advance(uint8(v.entry)) - buf[off+3] = uint8(v.entry >> 8) - - off += 4 - if off == 0 { - if len(dst)+256 > maxDecodedSize { - br.close() - return nil, ErrMaxDecodedSizeExceeded - } - dst = append(dst, buf[:]...) - } - } - - if len(dst)+int(off) > maxDecodedSize { - br.close() - return nil, ErrMaxDecodedSizeExceeded - } - dst = append(dst, buf[:off]...) - - // br < 8, so uint8 is fine - bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead - for bitsLeft > 0 { - br.fill() - if false && br.bitsRead >= 32 { - if br.off >= 4 { - v := br.in[br.off-4:] - v = v[:4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - br.value = (br.value << 32) | uint64(low) - br.bitsRead -= 32 - br.off -= 4 - } else { - for br.off > 0 { - br.value = (br.value << 8) | uint64(br.in[br.off-1]) - br.bitsRead -= 8 - br.off-- - } - } - } - if len(dst) >= maxDecodedSize { - br.close() - return nil, ErrMaxDecodedSizeExceeded - } - v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask] - nBits := uint8(v.entry) - br.advance(nBits) - bitsLeft -= nBits - dst = append(dst, uint8(v.entry>>8)) +func (d *Decoder) buffer() *[4][256]byte { + buf, ok := d.bufs.Get().(*[4][256]byte) + if ok { + return buf } - return dst, br.close() + return &[4][256]byte{} } // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8. @@ -341,41 +247,258 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { dt := d.dt.single[:256] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 - shift := (8 - d.actualTableLog) & 7 - - //fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog) - for br.off >= 4 { - br.fillFast() - v := dt[br.peekByteFast()>>shift] - br.advance(uint8(v.entry)) - buf[off+0] = uint8(v.entry >> 8) - - v = dt[br.peekByteFast()>>shift] - br.advance(uint8(v.entry)) - buf[off+1] = uint8(v.entry >> 8) - - v = dt[br.peekByteFast()>>shift] - br.advance(uint8(v.entry)) - buf[off+2] = uint8(v.entry >> 8) - - v = dt[br.peekByteFast()>>shift] - br.advance(uint8(v.entry)) - buf[off+3] = uint8(v.entry >> 8) - - off += 4 - if off == 0 { - if len(dst)+256 > maxDecodedSize { - br.close() - return nil, ErrMaxDecodedSizeExceeded + switch d.actualTableLog { + case 8: + const shift = 0 + for br.off >= 4 { + br.fillFast() + v := dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + br.close() + d.bufs.Put(bufs) + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) 
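// Note at this step: off is a uint8, so off += 4 wraps to 0 after 256 bytes;
// the `if off == 0` test detects a full scratch buffer without a separate
// counter. The same wrap trick recurs in every tablelog case below and in
// the 4X decoders.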
+ } + } + case 7: + const shift = 8 - 7 + for br.off >= 4 { + br.fillFast() + v := dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + br.close() + d.bufs.Put(bufs) + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) + } + } + case 6: + const shift = 8 - 6 + for br.off >= 4 { + br.fillFast() + v := dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) + } + } + case 5: + const shift = 8 - 5 + for br.off >= 4 { + br.fillFast() + v := dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) + } + } + case 4: + const shift = 8 - 4 + for br.off >= 4 { + br.fillFast() + v := dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) + } + } + case 3: + const shift = 8 - 3 + for br.off >= 4 { + br.fillFast() + v := dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) } - dst = append(dst, buf[:]...) 
} + case 2: + const shift = 8 - 2 + for br.off >= 4 { + br.fillFast() + v := dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) + } + } + case 1: + const shift = 8 - 1 + for br.off >= 4 { + br.fillFast() + v := dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[uint8(br.value>>(56+shift))] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) + } + } + default: + d.bufs.Put(bufs) + return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog) } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -383,6 +506,8 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { // br < 4, so uint8 is fine bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead)) + shift := (8 - d.actualTableLog) & 7 + for bitsLeft > 0 { if br.bitsRead >= 64-8 { for br.off > 0 { @@ -393,6 +518,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { } if len(dst) >= maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } v := dt[br.peekByteFast()>>shift] @@ -401,6 +527,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { bitsLeft -= int8(nBits) dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } @@ -420,33 +547,35 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { dt := d.dt.single[:256] // Use temp table to avoid bound checks/append penalty. 
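// Aside (editorial sketch): in the ...8BitExactly variant that follows,
// actualTableLog is exactly 8, so the table index is simply the top byte of
// the bit container; the diff replaces peekByteFast()>>shift with the direct
// form:
//
//	v := dt[uint8(br.value>>56)] // top 8 bits == next 8 input bits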
- var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 - const shift = 0 + const shift = 56 //fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog) for br.off >= 4 { br.fillFast() - v := dt[br.peekByteFast()>>shift] + v := dt[uint8(br.value>>shift)] br.advance(uint8(v.entry)) buf[off+0] = uint8(v.entry >> 8) - v = dt[br.peekByteFast()>>shift] + v = dt[uint8(br.value>>shift)] br.advance(uint8(v.entry)) buf[off+1] = uint8(v.entry >> 8) - v = dt[br.peekByteFast()>>shift] + v = dt[uint8(br.value>>shift)] br.advance(uint8(v.entry)) buf[off+2] = uint8(v.entry >> 8) - v = dt[br.peekByteFast()>>shift] + v = dt[uint8(br.value>>shift)] br.advance(uint8(v.entry)) buf[off+3] = uint8(v.entry >> 8) off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -455,6 +584,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -471,208 +601,20 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { } } if len(dst) >= maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } - v := dt[br.peekByteFast()>>shift] + v := dt[br.peekByteFast()] nBits := uint8(v.entry) br.advance(nBits) bitsLeft -= int8(nBits) dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } -// Decompress4X will decompress a 4X encoded stream. -// The length of the supplied input must match the end of a block exactly. -// The *capacity* of the dst slice must match the destination size of -// the uncompressed data exactly. -func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { - if len(d.dt.single) == 0 { - return nil, errors.New("no table loaded") - } - if len(src) < 6+(4*1) { - return nil, errors.New("input too small") - } - if use8BitTables && d.actualTableLog <= 8 { - return d.decompress4X8bit(dst, src) - } - - var br [4]bitReaderShifted - start := 6 - for i := 0; i < 3; i++ { - length := int(src[i*2]) | (int(src[i*2+1]) << 8) - if start+length >= len(src) { - return nil, errors.New("truncated input (or invalid offset)") - } - err := br[i].init(src[start : start+length]) - if err != nil { - return nil, err - } - start += length - } - err := br[3].init(src[start:]) - if err != nil { - return nil, err - } - - // destination, offset to match first output - dstSize := cap(dst) - dst = dst[:dstSize] - out := dst - dstEvery := (dstSize + 3) / 4 - - const tlSize = 1 << tableLogMax - const tlMask = tlSize - 1 - single := d.dt.single[:tlSize] - - // Use temp table to avoid bound checks/append penalty. - var buf [256]byte - var off uint8 - var decoded int - - // Decode 2 values from each decoder/loop. 
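// Aside (editorial note): this scalar Decompress4X is deleted here and
// re-added in decompress_amd64.go below with an assembly main loop. The four
// interleaved streams write to fixed quarters of dst:
//
//	dstEvery := (dstSize + 3) / 4 // stream i writes at dst[i*dstEvery:]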
- const bufoff = 256 / 4 - for { - if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { - break - } - - { - const stream = 0 - const stream2 = 1 - br[stream].fillFast() - br[stream2].fillFast() - - val := br[stream].peekBitsFast(d.actualTableLog) - v := single[val&tlMask] - br[stream].advance(uint8(v.entry)) - buf[off+bufoff*stream] = uint8(v.entry >> 8) - - val2 := br[stream2].peekBitsFast(d.actualTableLog) - v2 := single[val2&tlMask] - br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream2] = uint8(v2.entry >> 8) - - val = br[stream].peekBitsFast(d.actualTableLog) - v = single[val&tlMask] - br[stream].advance(uint8(v.entry)) - buf[off+bufoff*stream+1] = uint8(v.entry >> 8) - - val2 = br[stream2].peekBitsFast(d.actualTableLog) - v2 = single[val2&tlMask] - br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8) - } - - { - const stream = 2 - const stream2 = 3 - br[stream].fillFast() - br[stream2].fillFast() - - val := br[stream].peekBitsFast(d.actualTableLog) - v := single[val&tlMask] - br[stream].advance(uint8(v.entry)) - buf[off+bufoff*stream] = uint8(v.entry >> 8) - - val2 := br[stream2].peekBitsFast(d.actualTableLog) - v2 := single[val2&tlMask] - br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream2] = uint8(v2.entry >> 8) - - val = br[stream].peekBitsFast(d.actualTableLog) - v = single[val&tlMask] - br[stream].advance(uint8(v.entry)) - buf[off+bufoff*stream+1] = uint8(v.entry >> 8) - - val2 = br[stream2].peekBitsFast(d.actualTableLog) - v2 = single[val2&tlMask] - br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8) - } - - off += 2 - - if off == bufoff { - if bufoff > dstEvery { - return nil, errors.New("corruption detected: stream overrun 1") - } - copy(out, buf[:bufoff]) - copy(out[dstEvery:], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4]) - off = 0 - out = out[bufoff:] - decoded += 256 - // There must at least be 3 buffers left. - if len(out) < dstEvery*3 { - return nil, errors.New("corruption detected: stream overrun 2") - } - } - } - if off > 0 { - ioff := int(off) - if len(out) < dstEvery*3+ioff { - return nil, errors.New("corruption detected: stream overrun 3") - } - copy(out, buf[:off]) - copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4]) - decoded += int(off) * 4 - out = out[off:] - } - - // Decode remaining. - for i := range br { - offset := dstEvery * i - br := &br[i] - bitsLeft := br.off*8 + uint(64-br.bitsRead) - for bitsLeft > 0 { - br.fill() - if false && br.bitsRead >= 32 { - if br.off >= 4 { - v := br.in[br.off-4:] - v = v[:4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - br.value = (br.value << 32) | uint64(low) - br.bitsRead -= 32 - br.off -= 4 - } else { - for br.off > 0 { - br.value = (br.value << 8) | uint64(br.in[br.off-1]) - br.bitsRead -= 8 - br.off-- - } - } - } - // end inline... - if offset >= len(out) { - return nil, errors.New("corruption detected: stream overrun 4") - } - - // Read value and increment offset. 
- val := br.peekBitsFast(d.actualTableLog) - v := single[val&tlMask].entry - nBits := uint8(v) - br.advance(nBits) - bitsLeft -= uint(nBits) - out[offset] = uint8(v >> 8) - offset++ - } - decoded += offset - dstEvery*i - err = br.close() - if err != nil { - return nil, err - } - } - if dstSize != decoded { - return nil, errors.New("corruption detected: short output block") - } - return dst, nil -} - // Decompress4X will decompress a 4X encoded stream. // The length of the supplied input must match the end of a block exactly. // The *capacity* of the dst slice must match the destination size of @@ -706,19 +648,18 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { out := dst dstEvery := (dstSize + 3) / 4 - shift := (8 - d.actualTableLog) & 7 + shift := (56 + (8 - d.actualTableLog)) & 63 const tlSize = 1 << 8 - const tlMask = tlSize - 1 single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + buf := d.buffer() var off uint8 var decoded int // Decode 4 values from each decoder/loop. - const bufoff = 256 / 4 + const bufoff = 256 for { if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { break @@ -728,120 +669,144 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { // Interleave 2 decodes. const stream = 0 const stream2 = 1 - br[stream].fillFast() - br[stream2].fillFast() - - v := single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 := single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+1] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+2] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+3] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry + v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+3] = 
uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } { const stream = 2 const stream2 = 3 - br[stream].fillFast() - br[stream2].fillFast() - - v := single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 := single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+1] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+2] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+3] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry + v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } off += 4 - if off == bufoff { + if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } - copy(out, buf[:bufoff]) - copy(out[dstEvery:], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4]) - off = 0 - out = out[bufoff:] - decoded += 256 // There must at least be 3 buffers left. 
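// Note at this step: after flushing bufoff bytes into stream 0's quarter,
// streams 1-3 must each still have a full dstEvery stride of room, hence the
// len(out)-bufoff < dstEvery*3 check that follows.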
- if len(out) < dstEvery*3 { + if len(out)-bufoff < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } + //copy(out, buf[0][:]) + //copy(out[dstEvery:], buf[1][:]) + //copy(out[dstEvery*2:], buf[2][:]) + *(*[bufoff]byte)(out) = buf[0] + *(*[bufoff]byte)(out[dstEvery:]) = buf[1] + *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2] + *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3] + out = out[bufoff:] + decoded += bufoff * 4 } } if off > 0 { ioff := int(off) if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 3") } - copy(out, buf[:off]) - copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4]) + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) decoded += int(off) * 4 out = out[off:] } // Decode remaining. + // Decode remaining. + remainBytes := dstEvery - (decoded / 4) for i := range br { offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } br := &br[i] - bitsLeft := int(br.off*8) + int(64-br.bitsRead) + bitsLeft := br.remaining() for bitsLeft > 0 { if br.finished() { + d.bufs.Put(buf) return nil, io.ErrUnexpectedEOF } if br.bitsRead >= 56 { @@ -861,24 +826,31 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { } } // end inline... - if offset >= len(out) { + if offset >= endsAt { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } // Read value and increment offset. - v := single[br.peekByteFast()>>shift].entry + v := single[uint8(br.value>>shift)].entry nBits := uint8(v) br.advance(nBits) - bitsLeft -= int(nBits) + bitsLeft -= uint(nBits) out[offset] = uint8(v >> 8) offset++ } + if offset != endsAt { + d.bufs.Put(buf) + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } decoded += offset - dstEvery*i err = br.close() if err != nil { + d.bufs.Put(buf) return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } @@ -914,18 +886,17 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { out := dst dstEvery := (dstSize + 3) / 4 - const shift = 0 + const shift = 56 const tlSize = 1 << 8 - const tlMask = tlSize - 1 single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + buf := d.buffer() var off uint8 var decoded int // Decode 4 values from each decoder/loop. - const bufoff = 256 / 4 + const bufoff = 256 for { if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { break @@ -935,98 +906,116 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { // Interleave 2 decodes. 
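// Aside (editorial sketch): *(*[bufoff]byte)(out) = buf[0] above uses Go's
// slice-to-array-pointer conversion (Go 1.17+), so the 256-byte flush can
// compile to one bounds check plus a fixed-size move; it replaces the
// commented-out copy(out, buf[0][:]) calls kept above for reference.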
const stream = 0 const stream2 = 1 - br[stream].fillFast() - br[stream2].fillFast() - - v := single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 := single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+1] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+2] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+3] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry + v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } { const stream = 2 const stream2 = 3 - br[stream].fillFast() - br[stream2].fillFast() - - v := single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 := single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+1] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+2] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) - - v = single[br[stream].peekByteFast()>>shift].entry - buf[off+bufoff*stream+3] = uint8(v >> 8) - br[stream].advance(uint8(v)) - - v2 = single[br[stream2].peekByteFast()>>shift].entry - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - br[stream2].advance(uint8(v2)) + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry 
+ v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } off += 4 - if off == bufoff { + if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } - copy(out, buf[:bufoff]) - copy(out[dstEvery:], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4]) - off = 0 - out = out[bufoff:] - decoded += 256 // There must at least be 3 buffers left. - if len(out) < dstEvery*3 { + if len(out)-bufoff < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } + + //copy(out, buf[0][:]) + //copy(out[dstEvery:], buf[1][:]) + //copy(out[dstEvery*2:], buf[2][:]) + // copy(out[dstEvery*3:], buf[3][:]) + *(*[bufoff]byte)(out) = buf[0] + *(*[bufoff]byte)(out[dstEvery:]) = buf[1] + *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2] + *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3] + out = out[bufoff:] + decoded += bufoff * 4 } } if off > 0 { @@ -1034,21 +1023,27 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { if len(out) < dstEvery*3+ioff { return nil, errors.New("corruption detected: stream overrun 3") } - copy(out, buf[:off]) - copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4]) + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) decoded += int(off) * 4 out = out[off:] } // Decode remaining. + remainBytes := dstEvery - (decoded / 4) for i := range br { offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } br := &br[i] - bitsLeft := int(br.off*8) + int(64-br.bitsRead) + bitsLeft := br.remaining() for bitsLeft > 0 { if br.finished() { + d.bufs.Put(buf) return nil, io.ErrUnexpectedEOF } if br.bitsRead >= 56 { @@ -1068,24 +1063,32 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { } } // end inline... - if offset >= len(out) { + if offset >= endsAt { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } // Read value and increment offset. 
- v := single[br.peekByteFast()>>shift].entry + v := single[br.peekByteFast()].entry nBits := uint8(v) br.advance(nBits) - bitsLeft -= int(nBits) + bitsLeft -= uint(nBits) out[offset] = uint8(v >> 8) offset++ } + if offset != endsAt { + d.bufs.Put(buf) + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } + decoded += offset - dstEvery*i err = br.close() if err != nil { + d.bufs.Put(buf) return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } @@ -1133,7 +1136,7 @@ func (s *Scratch) matches(ct cTable, w io.Writer) { errs++ } if errs > 0 { - fmt.Fprintf(w, "%d errros in base, stopping\n", errs) + fmt.Fprintf(w, "%d errors in base, stopping\n", errs) continue } // Ensure that all combinations are covered. @@ -1149,7 +1152,7 @@ func (s *Scratch) matches(ct cTable, w io.Writer) { errs++ } if errs > 20 { - fmt.Fprintf(w, "%d errros, stopping\n", errs) + fmt.Fprintf(w, "%d errors, stopping\n", errs) break } } diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go new file mode 100644 index 0000000000..ba7e8e6b02 --- /dev/null +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go @@ -0,0 +1,226 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +// This file contains the specialisation of Decoder.Decompress4X +// and Decoder.Decompress1X that use an asm implementation of thir main loops. +package huff0 + +import ( + "errors" + "fmt" + + "github.com/klauspost/compress/internal/cpuinfo" +) + +// decompress4x_main_loop_x86 is an x86 assembler implementation +// of Decompress4X when tablelog > 8. +// +//go:noescape +func decompress4x_main_loop_amd64(ctx *decompress4xContext) + +// decompress4x_8b_loop_x86 is an x86 assembler implementation +// of Decompress4X when tablelog <= 8 which decodes 4 entries +// per loop. +// +//go:noescape +func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) + +// fallback8BitSize is the size where using Go version is faster. +const fallback8BitSize = 800 + +type decompress4xContext struct { + pbr *[4]bitReaderShifted + peekBits uint8 + out *byte + dstEvery int + tbl *dEntrySingle + decoded int + limit *byte +} + +// Decompress4X will decompress a 4X encoded stream. +// The length of the supplied input must match the end of a block exactly. +// The *capacity* of the dst slice must match the destination size of +// the uncompressed data exactly. 
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + if len(src) < 6+(4*1) { + return nil, errors.New("input too small") + } + + use8BitTables := d.actualTableLog <= 8 + if cap(dst) < fallback8BitSize && use8BitTables { + return d.decompress4X8bit(dst, src) + } + + var br [4]bitReaderShifted + // Decode "jump table" + start := 6 + for i := 0; i < 3; i++ { + length := int(src[i*2]) | (int(src[i*2+1]) << 8) + if start+length >= len(src) { + return nil, errors.New("truncated input (or invalid offset)") + } + err := br[i].init(src[start : start+length]) + if err != nil { + return nil, err + } + start += length + } + err := br[3].init(src[start:]) + if err != nil { + return nil, err + } + + // destination, offset to match first output + dstSize := cap(dst) + dst = dst[:dstSize] + out := dst + dstEvery := (dstSize + 3) / 4 + + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + single := d.dt.single[:tlSize] + + var decoded int + + if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { + ctx := decompress4xContext{ + pbr: &br, + peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() + out: &out[0], + dstEvery: dstEvery, + tbl: &single[0], + limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last. + } + if use8BitTables { + decompress4x_8b_main_loop_amd64(&ctx) + } else { + decompress4x_main_loop_amd64(&ctx) + } + + decoded = ctx.decoded + out = out[decoded/4:] + } + + // Decode remaining. + remainBytes := dstEvery - (decoded / 4) + for i := range br { + offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } + br := &br[i] + bitsLeft := br.remaining() + for bitsLeft > 0 { + br.fill() + if offset >= endsAt { + return nil, errors.New("corruption detected: stream overrun 4") + } + + // Read value and increment offset. + val := br.peekBitsFast(d.actualTableLog) + v := single[val&tlMask].entry + nBits := uint8(v) + br.advance(nBits) + bitsLeft -= uint(nBits) + out[offset] = uint8(v >> 8) + offset++ + } + if offset != endsAt { + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } + decoded += offset - dstEvery*i + err = br.close() + if err != nil { + return nil, err + } + } + if dstSize != decoded { + return nil, errors.New("corruption detected: short output block") + } + return dst, nil +} + +// decompress4x_main_loop_x86 is an x86 assembler implementation +// of Decompress1X when tablelog > 8. +// +//go:noescape +func decompress1x_main_loop_amd64(ctx *decompress1xContext) + +// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation +// of Decompress1X when tablelog > 8. +// +//go:noescape +func decompress1x_main_loop_bmi2(ctx *decompress1xContext) + +type decompress1xContext struct { + pbr *bitReaderShifted + peekBits uint8 + out *byte + outCap int + tbl *dEntrySingle + decoded int +} + +// Error reported by asm implementations +const error_max_decoded_size_exeeded = -1 + +// Decompress1X will decompress a 1X encoded stream. +// The cap of the output buffer will be the maximum decompressed size. +// The length of the supplied input must match the end of a block exactly. 
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + var br bitReaderShifted + err := br.init(src) + if err != nil { + return dst, err + } + maxDecodedSize := cap(dst) + dst = dst[:maxDecodedSize] + + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + + if maxDecodedSize >= 4 { + ctx := decompress1xContext{ + pbr: &br, + out: &dst[0], + outCap: maxDecodedSize, + peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() + tbl: &d.dt.single[0], + } + + if cpuinfo.HasBMI2() { + decompress1x_main_loop_bmi2(&ctx) + } else { + decompress1x_main_loop_amd64(&ctx) + } + if ctx.decoded == error_max_decoded_size_exeeded { + return nil, ErrMaxDecodedSizeExceeded + } + + dst = dst[:ctx.decoded] + } + + // br < 8, so uint8 is fine + bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead + for bitsLeft > 0 { + br.fill() + if len(dst) >= maxDecodedSize { + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask] + nBits := uint8(v.entry) + br.advance(nBits) + bitsLeft -= nBits + dst = append(dst, uint8(v.entry>>8)) + } + return dst, br.close() +} diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s new file mode 100644 index 0000000000..c4c7ab2d1f --- /dev/null +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s @@ -0,0 +1,830 @@ +// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT. + +//go:build amd64 && !appengine && !noasm && gc + +// func decompress4x_main_loop_amd64(ctx *decompress4xContext) +TEXT ·decompress4x_main_loop_amd64(SB), $0-8 + // Preload values + MOVQ ctx+0(FP), AX + MOVBQZX 8(AX), DI + MOVQ 16(AX), BX + MOVQ 48(AX), SI + MOVQ 24(AX), R8 + MOVQ 32(AX), R9 + MOVQ (AX), R10 + + // Main loop +main_loop: + XORL DX, DX + CMPQ BX, SI + SETGE DL + + // br0.fillFast32() + MOVQ 32(R10), R11 + MOVBQZX 40(R10), R12 + CMPQ R12, $0x20 + JBE skip_fill0 + MOVQ 24(R10), AX + SUBQ $0x20, R12 + SUBQ $0x04, AX + MOVQ (R10), R13 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 24(R10) + ORQ R13, R11 + + // exhausted += (br0.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL + +skip_fill0: + // val0 := br0.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v0 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br0.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + + // val1 := br0.peekTopBits(peekBits) + MOVQ DI, CX + MOVQ R11, R13 + SHRQ CL, R13 + + // v1 := table[val1&mask] + MOVW (R9)(R13*2), CX + + // br0.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (BX) + + // update the bitreader structure + MOVQ R11, 32(R10) + MOVB R12, 40(R10) + + // br1.fillFast32() + MOVQ 80(R10), R11 + MOVBQZX 88(R10), R12 + CMPQ R12, $0x20 + JBE skip_fill1 + MOVQ 72(R10), AX + SUBQ $0x20, R12 + SUBQ $0x04, AX + MOVQ 48(R10), R13 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 72(R10) + ORQ R13, R11 + + // exhausted += (br1.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL + +skip_fill1: + // val0 := br1.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + 
SHRQ CL, R13 + + // v0 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br1.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + + // val1 := br1.peekTopBits(peekBits) + MOVQ DI, CX + MOVQ R11, R13 + SHRQ CL, R13 + + // v1 := table[val1&mask] + MOVW (R9)(R13*2), CX + + // br1.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (BX)(R8*1) + + // update the bitreader structure + MOVQ R11, 80(R10) + MOVB R12, 88(R10) + + // br2.fillFast32() + MOVQ 128(R10), R11 + MOVBQZX 136(R10), R12 + CMPQ R12, $0x20 + JBE skip_fill2 + MOVQ 120(R10), AX + SUBQ $0x20, R12 + SUBQ $0x04, AX + MOVQ 96(R10), R13 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 120(R10) + ORQ R13, R11 + + // exhausted += (br2.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL + +skip_fill2: + // val0 := br2.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v0 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br2.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + + // val1 := br2.peekTopBits(peekBits) + MOVQ DI, CX + MOVQ R11, R13 + SHRQ CL, R13 + + // v1 := table[val1&mask] + MOVW (R9)(R13*2), CX + + // br2.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (BX)(R8*2) + + // update the bitreader structure + MOVQ R11, 128(R10) + MOVB R12, 136(R10) + + // br3.fillFast32() + MOVQ 176(R10), R11 + MOVBQZX 184(R10), R12 + CMPQ R12, $0x20 + JBE skip_fill3 + MOVQ 168(R10), AX + SUBQ $0x20, R12 + SUBQ $0x04, AX + MOVQ 144(R10), R13 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 168(R10) + ORQ R13, R11 + + // exhausted += (br3.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL + +skip_fill3: + // val0 := br3.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v0 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br3.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + + // val1 := br3.peekTopBits(peekBits) + MOVQ DI, CX + MOVQ R11, R13 + SHRQ CL, R13 + + // v1 := table[val1&mask] + MOVW (R9)(R13*2), CX + + // br3.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + LEAQ (R8)(R8*2), CX + MOVW AX, (BX)(CX*1) + + // update the bitreader structure + MOVQ R11, 176(R10) + MOVB R12, 184(R10) + ADDQ $0x02, BX + TESTB DL, DL + JZ main_loop + MOVQ ctx+0(FP), AX + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) + RET + +// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) +TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 + // Preload values + MOVQ ctx+0(FP), CX + MOVBQZX 8(CX), DI + MOVQ 16(CX), BX + MOVQ 48(CX), SI + MOVQ 24(CX), R8 + MOVQ 32(CX), R9 + MOVQ (CX), R10 + + // Main loop +main_loop: + XORL DX, DX + CMPQ BX, SI + SETGE DL + + // br0.fillFast32() + MOVQ 32(R10), R11 + MOVBQZX 40(R10), R12 + CMPQ R12, $0x20 + JBE skip_fill0 + MOVQ 24(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ (R10), R14 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 24(R10) + ORQ 
R14, R11 + + // exhausted += (br0.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL + +skip_fill0: + // val0 := br0.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v0 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br0.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + + // val1 := br0.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v1 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br0.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + BSWAPL AX + + // val2 := br0.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v2 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br0.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + + // val3 := br0.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v3 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br0.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (BX) + + // update the bitreader structure + MOVQ R11, 32(R10) + MOVB R12, 40(R10) + + // br1.fillFast32() + MOVQ 80(R10), R11 + MOVBQZX 88(R10), R12 + CMPQ R12, $0x20 + JBE skip_fill1 + MOVQ 72(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 48(R10), R14 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 72(R10) + ORQ R14, R11 + + // exhausted += (br1.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL + +skip_fill1: + // val0 := br1.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v0 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br1.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + + // val1 := br1.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v1 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br1.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + BSWAPL AX + + // val2 := br1.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v2 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br1.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + + // val3 := br1.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v3 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br1.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (BX)(R8*1) + + // update the bitreader structure + MOVQ R11, 80(R10) + MOVB R12, 88(R10) + + // br2.fillFast32() + MOVQ 128(R10), R11 + MOVBQZX 136(R10), R12 + CMPQ R12, $0x20 + JBE skip_fill2 + MOVQ 120(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 96(R10), R14 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 120(R10) + ORQ R14, R11 + + // exhausted += (br2.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL + +skip_fill2: + // val0 := br2.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v0 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br2.advance(uint8(v0.entry) + MOVB CH, AL + 
SHLQ CL, R11 + ADDB CL, R12 + + // val1 := br2.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v1 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br2.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + BSWAPL AX + + // val2 := br2.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v2 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br2.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + + // val3 := br2.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v3 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br2.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (BX)(R8*2) + + // update the bitreader structure + MOVQ R11, 128(R10) + MOVB R12, 136(R10) + + // br3.fillFast32() + MOVQ 176(R10), R11 + MOVBQZX 184(R10), R12 + CMPQ R12, $0x20 + JBE skip_fill3 + MOVQ 168(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 144(R10), R14 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 168(R10) + ORQ R14, R11 + + // exhausted += (br3.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL + +skip_fill3: + // val0 := br3.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v0 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br3.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + + // val1 := br3.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v1 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br3.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + BSWAPL AX + + // val2 := br3.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v2 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br3.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R11 + ADDB CL, R12 + + // val3 := br3.peekTopBits(peekBits) + MOVQ R11, R13 + MOVQ DI, CX + SHRQ CL, R13 + + // v3 := table[val0&mask] + MOVW (R9)(R13*2), CX + + // br3.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R11 + ADDB CL, R12 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + LEAQ (R8)(R8*2), CX + MOVL AX, (BX)(CX*1) + + // update the bitreader structure + MOVQ R11, 176(R10) + MOVB R12, 184(R10) + ADDQ $0x04, BX + TESTB DL, DL + JZ main_loop + MOVQ ctx+0(FP), AX + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) + RET + +// func decompress1x_main_loop_amd64(ctx *decompress1xContext) +TEXT ·decompress1x_main_loop_amd64(SB), $0-8 + MOVQ ctx+0(FP), CX + MOVQ 16(CX), DX + MOVQ 24(CX), BX + CMPQ BX, $0x04 + JB error_max_decoded_size_exceeded + LEAQ (DX)(BX*1), BX + MOVQ (CX), SI + MOVQ (SI), R8 + MOVQ 24(SI), R9 + MOVQ 32(SI), R10 + MOVBQZX 40(SI), R11 + MOVQ 32(CX), SI + MOVBQZX 8(CX), DI + JMP loop_condition + +main_loop: + // Check if we have room for 4 bytes in the output buffer + LEAQ 4(DX), CX + CMPQ CX, BX + JGE error_max_decoded_size_exceeded + + // Decode 4 values + CMPQ R11, $0x20 + JL bitReader_fillFast_1_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), R12 + MOVQ R11, CX + SHLQ CL, R12 + ORQ R12, 
R10 + +bitReader_fillFast_1_end: + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + BSWAPL AX + CMPQ R11, $0x20 + JL bitReader_fillFast_2_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), R12 + MOVQ R11, CX + SHLQ CL, R12 + ORQ R12, R10 + +bitReader_fillFast_2_end: + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + BSWAPL AX + + // Store the decoded values + MOVL AX, (DX) + ADDQ $0x04, DX + +loop_condition: + CMPQ R9, $0x08 + JGE main_loop + + // Update ctx structure + MOVQ ctx+0(FP), AX + SUBQ 16(AX), DX + MOVQ DX, 40(AX) + MOVQ (AX), AX + MOVQ R9, 24(AX) + MOVQ R10, 32(AX) + MOVB R11, 40(AX) + RET + + // Report error +error_max_decoded_size_exceeded: + MOVQ ctx+0(FP), AX + MOVQ $-1, CX + MOVQ CX, 40(AX) + RET + +// func decompress1x_main_loop_bmi2(ctx *decompress1xContext) +// Requires: BMI2 +TEXT ·decompress1x_main_loop_bmi2(SB), $0-8 + MOVQ ctx+0(FP), CX + MOVQ 16(CX), DX + MOVQ 24(CX), BX + CMPQ BX, $0x04 + JB error_max_decoded_size_exceeded + LEAQ (DX)(BX*1), BX + MOVQ (CX), SI + MOVQ (SI), R8 + MOVQ 24(SI), R9 + MOVQ 32(SI), R10 + MOVBQZX 40(SI), R11 + MOVQ 32(CX), SI + MOVBQZX 8(CX), DI + JMP loop_condition + +main_loop: + // Check if we have room for 4 bytes in the output buffer + LEAQ 4(DX), CX + CMPQ CX, BX + JGE error_max_decoded_size_exceeded + + // Decode 4 values + CMPQ R11, $0x20 + JL bitReader_fillFast_1_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), CX + SHLXQ R11, CX, CX + ORQ CX, R10 + +bitReader_fillFast_1_end: + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + BSWAPL AX + CMPQ R11, $0x20 + JL bitReader_fillFast_2_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), CX + SHLXQ R11, CX, CX + ORQ CX, R10 + +bitReader_fillFast_2_end: + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + BSWAPL AX + + // Store the decoded values + MOVL AX, (DX) + ADDQ $0x04, DX + +loop_condition: + CMPQ R9, $0x08 + JGE main_loop + + // Update ctx structure + MOVQ ctx+0(FP), AX + SUBQ 16(AX), DX + MOVQ DX, 40(AX) + MOVQ (AX), AX + MOVQ R9, 24(AX) + MOVQ R10, 32(AX) + MOVB R11, 40(AX) + RET + + // Report error +error_max_decoded_size_exceeded: + MOVQ ctx+0(FP), AX + MOVQ $-1, CX + MOVQ CX, 40(AX) + RET diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go new file mode 100644 index 0000000000..908c17de63 --- /dev/null +++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go @@ -0,0 +1,299 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +// This file contains a generic implementation of Decoder.Decompress4X. +package huff0 + +import ( + "errors" + "fmt" +) + +// Decompress4X will decompress a 4X encoded stream. 
+// The length of the supplied input must match the end of a block exactly. +// The *capacity* of the dst slice must match the destination size of +// the uncompressed data exactly. +func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + if len(src) < 6+(4*1) { + return nil, errors.New("input too small") + } + if use8BitTables && d.actualTableLog <= 8 { + return d.decompress4X8bit(dst, src) + } + + var br [4]bitReaderShifted + // Decode "jump table" + start := 6 + for i := 0; i < 3; i++ { + length := int(src[i*2]) | (int(src[i*2+1]) << 8) + if start+length >= len(src) { + return nil, errors.New("truncated input (or invalid offset)") + } + err := br[i].init(src[start : start+length]) + if err != nil { + return nil, err + } + start += length + } + err := br[3].init(src[start:]) + if err != nil { + return nil, err + } + + // destination, offset to match first output + dstSize := cap(dst) + dst = dst[:dstSize] + out := dst + dstEvery := (dstSize + 3) / 4 + + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + single := d.dt.single[:tlSize] + + // Use temp table to avoid bound checks/append penalty. + buf := d.buffer() + var off uint8 + var decoded int + + // Decode 2 values from each decoder/loop. + const bufoff = 256 + for { + if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { + break + } + + { + const stream = 0 + const stream2 = 1 + br[stream].fillFast() + br[stream2].fillFast() + + val := br[stream].peekBitsFast(d.actualTableLog) + val2 := br[stream2].peekBitsFast(d.actualTableLog) + v := single[val&tlMask] + v2 := single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off] = uint8(v.entry >> 8) + buf[stream2][off] = uint8(v2.entry >> 8) + + val = br[stream].peekBitsFast(d.actualTableLog) + val2 = br[stream2].peekBitsFast(d.actualTableLog) + v = single[val&tlMask] + v2 = single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off+1] = uint8(v.entry >> 8) + buf[stream2][off+1] = uint8(v2.entry >> 8) + } + + { + const stream = 2 + const stream2 = 3 + br[stream].fillFast() + br[stream2].fillFast() + + val := br[stream].peekBitsFast(d.actualTableLog) + val2 := br[stream2].peekBitsFast(d.actualTableLog) + v := single[val&tlMask] + v2 := single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off] = uint8(v.entry >> 8) + buf[stream2][off] = uint8(v2.entry >> 8) + + val = br[stream].peekBitsFast(d.actualTableLog) + val2 = br[stream2].peekBitsFast(d.actualTableLog) + v = single[val&tlMask] + v2 = single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off+1] = uint8(v.entry >> 8) + buf[stream2][off+1] = uint8(v2.entry >> 8) + } + + off += 2 + + if off == 0 { + if bufoff > dstEvery { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 1") + } + // There must at least be 3 buffers left. 
+ if len(out)-bufoff < dstEvery*3 { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 2") + } + //copy(out, buf[0][:]) + //copy(out[dstEvery:], buf[1][:]) + //copy(out[dstEvery*2:], buf[2][:]) + //copy(out[dstEvery*3:], buf[3][:]) + *(*[bufoff]byte)(out) = buf[0] + *(*[bufoff]byte)(out[dstEvery:]) = buf[1] + *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2] + *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3] + out = out[bufoff:] + decoded += bufoff * 4 + } + } + if off > 0 { + ioff := int(off) + if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 3") + } + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) + decoded += int(off) * 4 + out = out[off:] + } + + // Decode remaining. + remainBytes := dstEvery - (decoded / 4) + for i := range br { + offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } + br := &br[i] + bitsLeft := br.remaining() + for bitsLeft > 0 { + br.fill() + if offset >= endsAt { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 4") + } + + // Read value and increment offset. + val := br.peekBitsFast(d.actualTableLog) + v := single[val&tlMask].entry + nBits := uint8(v) + br.advance(nBits) + bitsLeft -= uint(nBits) + out[offset] = uint8(v >> 8) + offset++ + } + if offset != endsAt { + d.bufs.Put(buf) + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } + decoded += offset - dstEvery*i + err = br.close() + if err != nil { + return nil, err + } + } + d.bufs.Put(buf) + if dstSize != decoded { + return nil, errors.New("corruption detected: short output block") + } + return dst, nil +} + +// Decompress1X will decompress a 1X encoded stream. +// The cap of the output buffer will be the maximum decompressed size. +// The length of the supplied input must match the end of a block exactly. +func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + if use8BitTables && d.actualTableLog <= 8 { + return d.decompress1X8Bit(dst, src) + } + var br bitReaderShifted + err := br.init(src) + if err != nil { + return dst, err + } + maxDecodedSize := cap(dst) + dst = dst[:0] + + // Avoid bounds check by always having full sized table. + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + dt := d.dt.single[:tlSize] + + // Use temp table to avoid bound checks/append penalty. + bufs := d.buffer() + buf := &bufs[0] + var off uint8 + + for br.off >= 8 { + br.fillFast() + v := dt[br.peekBitsFast(d.actualTableLog)&tlMask] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + // Refill + br.fillFast() + + v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + br.close() + d.bufs.Put(bufs) + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) + } + } + + if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:off]...) 
+ + // br < 8, so uint8 is fine + bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead + for bitsLeft > 0 { + br.fill() + if false && br.bitsRead >= 32 { + if br.off >= 4 { + v := br.in[br.off-4:] + v = v[:4] + low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + br.value = (br.value << 32) | uint64(low) + br.bitsRead -= 32 + br.off -= 4 + } else { + for br.off > 0 { + br.value = (br.value << 8) | uint64(br.in[br.off-1]) + br.bitsRead -= 8 + br.off-- + } + } + } + if len(dst) >= maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask] + nBits := uint8(v.entry) + br.advance(nBits) + bitsLeft -= nBits + dst = append(dst, uint8(v.entry>>8)) + } + d.bufs.Put(bufs) + return dst, br.close() +} diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go index 7ec2022b65..77ecd68e0a 100644 --- a/vendor/github.com/klauspost/compress/huff0/huff0.go +++ b/vendor/github.com/klauspost/compress/huff0/huff0.go @@ -8,6 +8,7 @@ import ( "fmt" "math" "math/bits" + "sync" "github.com/klauspost/compress/fse" ) @@ -87,7 +88,7 @@ type Scratch struct { // Decoders will return ErrMaxDecodedSizeExceeded is this limit is exceeded. MaxDecodedSize int - br byteReader + srcLen int // MaxSymbolValue will override the maximum symbol value of the next block. MaxSymbolValue uint8 @@ -116,6 +117,7 @@ type Scratch struct { nodes []nodeElt tmpOut [4][]byte fse *fse.Scratch + decPool sync.Pool // *[4][256]byte buffers. huffWeight [maxSymbolValue + 1]byte } @@ -168,7 +170,7 @@ func (s *Scratch) prepare(in []byte) (*Scratch, error) { if s.fse == nil { s.fse = &fse.Scratch{} } - s.br.init(in) + s.srcLen = len(in) return s, nil } @@ -245,6 +247,68 @@ func (c cTable) write(s *Scratch) error { return nil } +func (c cTable) estTableSize(s *Scratch) (sz int, err error) { + var ( + // precomputed conversion table + bitsToWeight [tableLogMax + 1]byte + huffLog = s.actualTableLog + // last weight is not saved. + maxSymbolValue = uint8(s.symbolLen - 1) + huffWeight = s.huffWeight[:256] + ) + const ( + maxFSETableLog = 6 + ) + // convert to weight + bitsToWeight[0] = 0 + for n := uint8(1); n < huffLog+1; n++ { + bitsToWeight[n] = huffLog + 1 - n + } + + // Acquire histogram for FSE. + hist := s.fse.Histogram() + hist = hist[:256] + for i := range hist[:16] { + hist[i] = 0 + } + for n := uint8(0); n < maxSymbolValue; n++ { + v := bitsToWeight[c[n].nBits] & 15 + huffWeight[n] = v + hist[v]++ + } + + // FSE compress if feasible. + if maxSymbolValue >= 2 { + huffMaxCnt := uint32(0) + huffMax := uint8(0) + for i, v := range hist[:16] { + if v == 0 { + continue + } + huffMax = byte(i) + if v > huffMaxCnt { + huffMaxCnt = v + } + } + s.fse.HistogramFinished(huffMax, int(huffMaxCnt)) + s.fse.TableLog = maxFSETableLog + b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse) + if err == nil && len(b) < int(s.symbolLen>>1) { + sz += 1 + len(b) + return sz, nil + } + // Unable to compress (RLE/uncompressible) + } + // write raw values as 4-bits (max : 15) + if maxSymbolValue > (256 - 128) { + // should not happen : likely means source cannot be compressed + return 0, ErrIncompressible + } + // special case, pack weights 4 bits/weight. + sz += 1 + int(maxSymbolValue/2) + return sz, nil +} + // estimateSize returns the estimated size in bytes of the input represented in the // histogram supplied. 
func (c cTable) estimateSize(hist []uint32) int { diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go new file mode 100644 index 0000000000..3954c51219 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go @@ -0,0 +1,34 @@ +// Package cpuinfo gives runtime info about the current CPU. +// +// This is a very limited module meant for use internally +// in this project. For more versatile solution check +// https://github.com/klauspost/cpuid. +package cpuinfo + +// HasBMI1 checks whether an x86 CPU supports the BMI1 extension. +func HasBMI1() bool { + return hasBMI1 +} + +// HasBMI2 checks whether an x86 CPU supports the BMI2 extension. +func HasBMI2() bool { + return hasBMI2 +} + +// DisableBMI2 will disable BMI2, for testing purposes. +// Call returned function to restore previous state. +func DisableBMI2() func() { + old := hasBMI2 + hasBMI2 = false + return func() { + hasBMI2 = old + } +} + +// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions. +func HasBMI() bool { + return HasBMI1() && HasBMI2() +} + +var hasBMI1 bool +var hasBMI2 bool diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go new file mode 100644 index 0000000000..e802579c4f --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go @@ -0,0 +1,11 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +package cpuinfo + +// go:noescape +func x86extensions() (bmi1, bmi2 bool) + +func init() { + hasBMI1, hasBMI2 = x86extensions() +} diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s new file mode 100644 index 0000000000..4465fbe9e9 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s @@ -0,0 +1,36 @@ +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" +#include "funcdata.h" +#include "go_asm.h" + +TEXT ·x86extensions(SB), NOSPLIT, $0 + // 1. determine max EAX value + XORQ AX, AX + CPUID + + CMPQ AX, $7 + JB unsupported + + // 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction" + MOVQ $7, AX + MOVQ $0, CX + CPUID + + BTQ $3, BX // bit 3 = BMI1 + SETCS AL + + BTQ $8, BX // bit 8 = BMI2 + SETCS AH + + MOVB AL, bmi1+0(FP) + MOVB AH, bmi2+1(FP) + RET + +unsupported: + XORQ AX, AX + MOVB AL, bmi1+0(FP) + MOVB AL, bmi2+1(FP) + RET diff --git a/vendor/github.com/klauspost/compress/internal/godebug/godebug.go b/vendor/github.com/klauspost/compress/internal/godebug/godebug.go new file mode 100644 index 0000000000..ff13f2a020 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/godebug/godebug.go @@ -0,0 +1,44 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package godebug makes the simplified settings in the $GODEBUG environment variable +// available to packages. +// Needed since internal/godebug is not available here. 
+package godebug + +import "os" + +func Get(key string) string { + s := os.Getenv("GODEBUG") + if s == "" { + return "" + } + // Scan the string backward so that later settings are used + // and earlier settings are ignored. + // Note that a forward scan would cause cached values + // to temporarily use the ignored value before being + // updated to the "correct" one. + end := len(s) + eq := -1 + for i := end - 1; i >= -1; i-- { + if i == -1 || s[i] == ',' { + if eq >= 0 { + name, arg := s[i+1:eq], s[eq+1:end] + if name == key { + for j := 0; j < len(arg); j++ { + if arg[j] == '#' { + return arg[:j] + } + } + return arg + } + } + eq = -1 + end = i + } else if s[i] == '=' { + eq = i + } + } + return "" +} diff --git a/vendor/github.com/klauspost/compress/internal/race/norace.go b/vendor/github.com/klauspost/compress/internal/race/norace.go new file mode 100644 index 0000000000..affbbbb595 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/race/norace.go @@ -0,0 +1,13 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !race + +package race + +func ReadSlice[T any](s []T) { +} + +func WriteSlice[T any](s []T) { +} diff --git a/vendor/github.com/klauspost/compress/internal/race/race.go b/vendor/github.com/klauspost/compress/internal/race/race.go new file mode 100644 index 0000000000..f5e240dcde --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/race/race.go @@ -0,0 +1,26 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build race + +package race + +import ( + "runtime" + "unsafe" +) + +func ReadSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} + +func WriteSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceWriteRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} diff --git a/vendor/github.com/golang/snappy/LICENSE b/vendor/github.com/klauspost/compress/internal/snapref/LICENSE similarity index 100% rename from vendor/github.com/golang/snappy/LICENSE rename to vendor/github.com/klauspost/compress/internal/snapref/LICENSE diff --git a/vendor/github.com/golang/snappy/decode.go b/vendor/github.com/klauspost/compress/internal/snapref/decode.go similarity index 99% rename from vendor/github.com/golang/snappy/decode.go rename to vendor/github.com/klauspost/compress/internal/snapref/decode.go index 23c6e26c6b..40796a49d6 100644 --- a/vendor/github.com/golang/snappy/decode.go +++ b/vendor/github.com/klauspost/compress/internal/snapref/decode.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-package snappy +package snapref import ( "encoding/binary" diff --git a/vendor/github.com/golang/snappy/decode_other.go b/vendor/github.com/klauspost/compress/internal/snapref/decode_other.go similarity index 98% rename from vendor/github.com/golang/snappy/decode_other.go rename to vendor/github.com/klauspost/compress/internal/snapref/decode_other.go index 2f672be557..77395a6b8b 100644 --- a/vendor/github.com/golang/snappy/decode_other.go +++ b/vendor/github.com/klauspost/compress/internal/snapref/decode_other.go @@ -2,9 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!arm64 appengine !gc noasm - -package snappy +package snapref // decode writes the decoding of src to dst. It assumes that the varint-encoded // length of the decompressed bytes has already been read, and that len(dst) diff --git a/vendor/github.com/golang/snappy/encode.go b/vendor/github.com/klauspost/compress/internal/snapref/encode.go similarity index 99% rename from vendor/github.com/golang/snappy/encode.go rename to vendor/github.com/klauspost/compress/internal/snapref/encode.go index 7f23657076..13c6040a5d 100644 --- a/vendor/github.com/golang/snappy/encode.go +++ b/vendor/github.com/klauspost/compress/internal/snapref/encode.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -package snappy +package snapref import ( "encoding/binary" diff --git a/vendor/github.com/golang/snappy/encode_other.go b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go similarity index 92% rename from vendor/github.com/golang/snappy/encode_other.go rename to vendor/github.com/klauspost/compress/internal/snapref/encode_other.go index 296d7f0beb..2754bac6f1 100644 --- a/vendor/github.com/golang/snappy/encode_other.go +++ b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go @@ -2,9 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!arm64 appengine !gc noasm - -package snappy +package snapref func load32(b []byte, i int) uint32 { b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. @@ -20,6 +18,7 @@ func load64(b []byte, i int) uint64 { // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 1 <= len(lit) && len(lit) <= 65536 func emitLiteral(dst, lit []byte) int { @@ -44,6 +43,7 @@ func emitLiteral(dst, lit []byte) int { // emitCopy writes a copy chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 1 <= offset && offset <= 65535 // 4 <= length && length <= 65535 @@ -51,7 +51,7 @@ func emitCopy(dst []byte, offset, length int) int { i := 0 // The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. 
The // threshold for this loop is a little higher (at 68 = 64 + 4), and the - // length emitted down below is is a little lower (at 60 = 64 - 4), because + // length emitted down below is a little lower (at 60 = 64 - 4), because // it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed // by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as // a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as @@ -87,28 +87,40 @@ func emitCopy(dst []byte, offset, length int) int { return i + 2 } -// extendMatch returns the largest k such that k <= len(src) and that -// src[i:i+k-j] and src[j:k] have the same contents. -// -// It assumes that: -// 0 <= i && i < j && j <= len(src) -func extendMatch(src []byte, i, j int) int { - for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 { - } - return j -} - func hash(u, shift uint32) uint32 { return (u * 0x1e35a7bd) >> shift } +// EncodeBlockInto exposes encodeBlock but checks dst size. +func EncodeBlockInto(dst, src []byte) (d int) { + if MaxEncodedLen(len(src)) > len(dst) { + return 0 + } + + // encodeBlock breaks on too big blocks, so split. + for len(src) > 0 { + p := src + src = nil + if len(p) > maxBlockSize { + p, src = p[:maxBlockSize], p[maxBlockSize:] + } + if len(p) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], p) + } else { + d += encodeBlock(dst[d:], p) + } + } + return d +} + // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlock(dst, src []byte) (d int) { // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. // The table element type is uint16, as s < sLimit and sLimit < len(src) diff --git a/vendor/github.com/klauspost/compress/snappy/snappy.go b/vendor/github.com/klauspost/compress/internal/snapref/snappy.go similarity index 97% rename from vendor/github.com/klauspost/compress/snappy/snappy.go rename to vendor/github.com/klauspost/compress/internal/snapref/snappy.go index 74a36689e8..34d01f4aa6 100644 --- a/vendor/github.com/klauspost/compress/snappy/snappy.go +++ b/vendor/github.com/klauspost/compress/internal/snapref/snappy.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Package snappy implements the Snappy compression format. It aims for very +// Package snapref implements the Snappy compression format. It aims for very // high speeds and reasonable compression. // // There are actually two Snappy formats: block and stream. They are related, @@ -17,7 +17,7 @@ // // The canonical, C++ implementation is at https://github.com/google/snappy and // it only implements the block format. 
-package snappy +package snapref import ( "hash/crc32" diff --git a/vendor/github.com/golang/snappy/.gitignore b/vendor/github.com/klauspost/compress/s2/.gitignore similarity index 92% rename from vendor/github.com/golang/snappy/.gitignore rename to vendor/github.com/klauspost/compress/s2/.gitignore index 042091d9b3..3a89c6e3e2 100644 --- a/vendor/github.com/golang/snappy/.gitignore +++ b/vendor/github.com/klauspost/compress/s2/.gitignore @@ -1,4 +1,3 @@ -cmd/snappytool/snappytool testdata/bench # These explicitly listed benchmark data files are for an obsolete version of diff --git a/vendor/github.com/klauspost/compress/snappy/LICENSE b/vendor/github.com/klauspost/compress/s2/LICENSE similarity index 96% rename from vendor/github.com/klauspost/compress/snappy/LICENSE rename to vendor/github.com/klauspost/compress/s2/LICENSE index 6050c10f4c..1d2d645bd9 100644 --- a/vendor/github.com/klauspost/compress/snappy/LICENSE +++ b/vendor/github.com/klauspost/compress/s2/LICENSE @@ -1,4 +1,5 @@ Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. +Copyright (c) 2019 Klaus Post. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md new file mode 100644 index 0000000000..8284bb0810 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -0,0 +1,1120 @@ +# S2 Compression + +S2 is an extension of [Snappy](https://github.com/google/snappy). + +S2 is aimed for high throughput, which is why it features concurrent compression for bigger payloads. + +Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy. +This means that S2 can seamlessly replace Snappy without converting compressed content. + +S2 can produce Snappy compatible output, faster and better than Snappy. +If you want full benefit of the changes you should use s2 without Snappy compatibility. + +S2 is designed to have high throughput on content that cannot be compressed. +This is important, so you don't have to worry about spending CPU cycles on already compressed data. 
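+
+For example, a minimal sketch of producing a block that stays readable by Snappy decoders
+(`EncodeSnappy` compresses with S2 but emits the Snappy block format; the function name here is illustrative):
+
+```Go
+func EncodeSnappyBlock(src []byte) []byte {
+	// Output can be decoded by any Snappy block decoder.
+	return s2.EncodeSnappy(nil, src)
+}
+```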
+
+## Benefits over Snappy
+
+* Better compression
+* Adjustable compression (3 levels)
+* Concurrent stream compression
+* Faster decompression, even for Snappy compatible content
+* Concurrent Snappy/S2 stream decompression
+* Skip forward in compressed stream
+* Random seeking with indexes
+* Compatible with reading Snappy compressed content
+* Smaller block size overhead on incompressible blocks
+* Block concatenation
+* Block Dictionary support
+* Uncompressed stream mode
+* Automatic stream size padding
+* Snappy compatible block compression
+
+## Drawbacks over Snappy
+
+* Not optimized for 32 bit systems
+* Streams use slightly more memory due to larger blocks and concurrency (configurable)
+
+# Usage
+
+Installation: `go get -u github.com/klauspost/compress/s2`
+
+Full package documentation:
+
+[![godoc][1]][2]
+
+[1]: https://godoc.org/github.com/klauspost/compress?status.svg
+[2]: https://godoc.org/github.com/klauspost/compress/s2
+
+## Compression
+
+```Go
+func EncodeStream(src io.Reader, dst io.Writer) error {
+	enc := s2.NewWriter(dst)
+	_, err := io.Copy(enc, src)
+	if err != nil {
+		enc.Close()
+		return err
+	}
+	// Blocks until compression is done.
+	return enc.Close()
+}
+```
+
+You should always call `enc.Close()`, otherwise you will leak resources and your encode will be incomplete.
+
+For the best throughput, you should attempt to reuse the `Writer` using the `Reset()` method.
+
+The Writer in S2 is always buffered, therefore `NewBufferedWriter` in Snappy can be replaced with `NewWriter` in S2.
+It is possible to flush any buffered data using the `Flush()` method.
+This will block until all data sent to the encoder has been written to the output.
+
+S2 also supports the `io.ReaderFrom` interface, which will consume all input from a reader.
+
+Finally, if you have a single block of data you would like to have encoded as a stream,
+a slightly more efficient method is to use the `EncodeBuffer` method.
+This will take ownership of the buffer until the stream is closed.
+
+```Go
+func EncodeStream(src []byte, dst io.Writer) error {
+	enc := s2.NewWriter(dst)
+	// The encoder owns the buffer until Flush or Close is called.
+	err := enc.EncodeBuffer(src)
+	if err != nil {
+		enc.Close()
+		return err
+	}
+	// Blocks until compression is done.
+	return enc.Close()
+}
+```
+
+Each call to `EncodeBuffer` will result in discrete blocks being created without buffering,
+so it should only be used a single time per stream.
+If you need to write several blocks, you should use the regular io.Writer interface.
+
+
+## Decompression
+
+```Go
+func DecodeStream(src io.Reader, dst io.Writer) error {
+	dec := s2.NewReader(src)
+	_, err := io.Copy(dst, dec)
+	return err
+}
+```
+
+Similar to the Writer, a Reader can be reused using the `Reset` method.
+
+For the best possible throughput, the `EncodeBuffer(buf []byte)` method described above is available on the `Writer`.
+However, it requires that the provided buffer isn't used after it is handed over to S2 and until the stream is flushed or closed.
+
+For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`.
+Do however note that these functions (similar to Snappy) do not provide validation of data,
+so data corruption may be undetected. Stream encoding provides CRC checks of data.
+
+It is possible to efficiently skip forward in a compressed stream using the `Skip()` method.
+For big skips the decompressor is able to skip blocks without decompressing them.
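+
+A minimal sketch of using `Skip` (function and parameter names here are illustrative):
+
+```Go
+func DecodeFrom(src io.Reader, skipBytes int64, dst io.Writer) error {
+	dec := s2.NewReader(src)
+	// Skip the first skipBytes of *decompressed* output; whole blocks
+	// are skipped without being decompressed when possible.
+	if err := dec.Skip(skipBytes); err != nil {
+		return err
+	}
+	_, err := io.Copy(dst, dec)
+	return err
+}
+```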
+
+## Single Blocks
+
+Similar to Snappy, S2 offers single-block compression.
+Blocks do not offer the same flexibility and safety as streams,
+but may be preferable for very small payloads, less than 100K.
+
+Using a simple `dst := s2.Encode(nil, src)` will compress `src` and return the compressed result.
+It is possible to provide a destination buffer.
+If the buffer has a capacity of `s2.MaxEncodedLen(len(src))` it will be used.
+If not, a new one will be allocated.
+
+Alternatively, `EncodeBetter`/`EncodeBest` can be used for better, but slightly slower compression.
+
+Similarly, to decompress a block you can use `dst, err := s2.Decode(nil, src)`.
+Again an optional destination buffer can be supplied.
+`s2.DecodedLen(src)` can be used to get the minimum capacity needed.
+If that is not satisfied, a new buffer will be allocated.
+
+Block functions always operate on a single goroutine since they should only be used for small payloads.
+
+# Commandline tools
+
+Some very simple commandline tools are provided; `s2c` for compression and `s2d` for decompression.
+
+Binaries can be downloaded on the [Releases Page](https://github.com/klauspost/compress/releases).
+
+Installing them requires Go to be installed. To install them, use:
+
+`go install github.com/klauspost/compress/s2/cmd/s2c@latest && go install github.com/klauspost/compress/s2/cmd/s2d@latest`
+
+To build binaries to the current folder use:
+
+`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d`
+
+
+## s2c
+
+```
+Usage: s2c [options] file1 file2
+
+Compresses all files supplied as input separately.
+Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'.
+By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and compressed.
+Only http response code 200 is accepted.
+
+Options:
+  -bench int
+    	Run benchmark n times. No output will be written
+  -blocksize string
+    	Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M")
+  -c	Write all output to stdout. Multiple input files will be concatenated
+  -cpu int
+    	Compress using this amount of threads (default 32)
+  -faster
+    	Compress faster, but with a minor compression loss
+  -help
+    	Display help
+  -index
+    	Add seek index (default true)
+  -o string
+    	Write output to another file. Single input file only
+  -pad string
+    	Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1")
+  -q	Don't write any output to terminal, except errors
+  -rm
+    	Delete source file(s) after successful compression
+  -safe
+    	Do not overwrite output files
+  -slower
+    	Compress more, but a lot slower
+  -snappy
+    	Generate Snappy compatible output stream
+  -verify
+    	Verify written files
+
+```
+
+## s2d
+
+```
+Usage: s2d [options] file1 file2
+
+Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
+Output file names have the extension removed. By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and decompressed.
+Extensions on downloaded files are ignored. Only http response code 200 is accepted.
+
+Options:
+  -bench int
+    	Run benchmark n times. No output will be written
+  -c	Write all output to stdout. Multiple input files will be concatenated
+  -help
+    	Display help
+  -o string
+    	Write output to another file. Single input file only
+  -offset string
+    	Start at offset. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
+  -q	Don't write any output to terminal, except errors
+  -rm
+    	Delete source file(s) after successful decompression
+  -safe
+    	Do not overwrite output files
+  -tail string
+    	Return last of compressed file. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
+  -verify
+    	Verify files, but do not write output
+```
+
+## s2sx: self-extracting archives
+
+s2sx allows creating self-extracting archives with no dependencies.
+
+By default, executables are created for the same platforms as the host OS,
+but this can be overridden with `-os` and `-arch` parameters.
+
+Extracted files have 0666 permissions, except when the untar option is used.
+
+```
+Usage: s2sx [options] file1 file2
+
+Compresses all files supplied as input separately.
+If files have '.s2' extension they are assumed to be compressed already.
+Output files are written as 'filename.s2sx' and with '.exe' for windows targets.
+If output is big, an additional file with ".more" is written. This must be included as well.
+By default output files will be overwritten.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+Options:
+  -arch string
+    	Destination architecture (default "amd64")
+  -c	Write all output to stdout. Multiple input files will be concatenated
+  -cpu int
+    	Compress using this amount of threads (default 32)
+  -help
+    	Display help
+  -max string
+    	Maximum executable size. Rest will be written to another file. (default "1G")
+  -os string
+    	Destination operating system (default "windows")
+  -q	Don't write any output to terminal, except errors
+  -rm
+    	Delete source file(s) after successful compression
+  -safe
+    	Do not overwrite output files
+  -untar
+    	Untar on destination
+```
+
+Available platforms are:
+
+ * darwin-amd64
+ * darwin-arm64
+ * linux-amd64
+ * linux-arm
+ * linux-arm64
+ * linux-mips64
+ * linux-ppc64le
+ * windows-386
+ * windows-amd64
+
+By default, there is a size limit of 1GB for the output executable.
+
+When this is exceeded the remaining file content is written to a file called
+output+`.more`. This file must be included and placed alongside the executable
+for a successful extraction.
+
+This file *must* have the same name as the executable, so if the executable is renamed,
+so must the `.more` file.
+
+This functionality is disabled with stdin/stdout.
+
+### Self-extracting TAR files
+
+If you wrap a TAR file you can specify `-untar` to make it untar on the destination host.
+
+Files are extracted to the current folder with the path specified in the tar file.
+
+Note that tar files are not validated before they are wrapped.
+
+For security reasons files that move below the root folder are not allowed.
+
+# Performance
+
+This section will focus on comparisons to Snappy.
+This package is solely aimed at replacing Snappy as a high-speed compression package.
+If you are mainly looking for better compression [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) +gives better compression, but typically at speeds slightly below "better" mode in this package. + +Compression is increased compared to Snappy, mostly around 5-20% and the throughput is typically 25-40% increased (single threaded) compared to the Snappy Go implementation. + +Streams are concurrently compressed. The stream will be distributed among all available CPU cores for the best possible throughput. + +A "better" compression mode is also available. This allows to trade a bit of speed for a minor compression gain. +The content compressed in this mode is fully compatible with the standard decoder. + +Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU): + +| File | S2 Speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller | +|---------------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------| +| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 16.33x | 10556 MB/s | 8.0% | 6.04x | 5252 MB/s | 14.7% | +| (1 CPU) | 1.08x | 940 MB/s | - | 0.46x | 400 MB/s | - | +| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 16.51x | 15224 MB/s | 31.70% | 9.47x | 8734 MB/s | 37.71% | +| (1 CPU) | 1.26x | 1157 MB/s | - | 0.60x | 556 MB/s | - | +| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12598 MB/s | -5.76% | 6.23x | 5675 MB/s | 3.62% | +| (1 CPU) | 1.02x | 932 MB/s | - | 0.47x | 432 MB/s | - | +| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 11.21x | 12116 MB/s | 15.95% | 3.24x | 3500 MB/s | 18.00% | +| (1 CPU) | 1.05x | 1135 MB/s | - | 0.27x | 292 MB/s | - | +| [apache.log](https://files.klauspost.com/compress/apache.log.zst) | 8.55x | 16673 MB/s | 20.54% | 5.85x | 11420 MB/s | 24.97% | +| (1 CPU) | 1.91x | 1771 MB/s | - | 0.53x | 1041 MB/s | - | +| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 15.76x | 14357 MB/s | 24.01% | 8.67x | 7891 MB/s | 33.68% | +| (1 CPU) | 1.17x | 1064 MB/s | - | 0.65x | 595 MB/s | - | +| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9835 MB/s | 2.34% | 6.85x | 4863 MB/s | 9.96% | +| (1 CPU) | 0.97x | 689 MB/s | - | 0.55x | 387 MB/s | - | +| sharnd.out.2gb | 9.11x | 13213 MB/s | 0.01% | 1.49x | 9184 MB/s | 0.01% | +| (1 CPU) | 0.88x | 5418 MB/s | - | 0.77x | 5417 MB/s | - | +| [sofia-air-quality-dataset csv](https://files.klauspost.com/compress/sofia-air-quality-dataset.tar.zst) | 22.00x | 11477 MB/s | 18.73% | 11.15x | 5817 MB/s | 27.88% | +| (1 CPU) | 1.23x | 642 MB/s | - | 0.71x | 642 MB/s | - | +| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 11.23x | 6520 MB/s | 5.9% | 5.35x | 3109 MB/s | 15.88% | +| (1 CPU) | 1.05x | 607 MB/s | - | 0.52x | 304 MB/s | - | +| [enwik9](https://files.klauspost.com/compress/enwik9.zst) | 19.28x | 8440 MB/s | 4.04% | 9.31x | 4076 MB/s | 18.04% | +| (1 CPU) | 1.12x | 488 MB/s | - | 0.57x | 250 MB/s | - | + +### Legend + +* `S2 Speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core. +* `S2 Throughput`: Throughput of S2 in MB/s. +* `S2 % smaller`: How many percent of the Snappy output size is S2 better. 
+* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy. +* `"better" throughput`: Speed when enabling "better" compression mode in S2 compared to Snappy. +* `"better" % smaller`: How many percent of the Snappy output size is S2 better when using "better" compression. + +There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads. + +Machine generated data gets by far the biggest compression boost, with size being reduced by up to 35% of Snappy size. + +The "better" compression mode sees a good improvement in all cases, but usually at a performance cost. + +Incompressible content (`sharnd.out.2gb`, 2GB random data) sees the smallest speedup. +This is likely dominated by synchronization overhead, which is confirmed by the fact that single threaded performance is higher (see above). + +## Decompression + +S2 attempts to create content that is also fast to decompress, except in "better" mode where the smallest representation is used. + +S2 vs Snappy **decompression** speed. Both operating on single core: + +| File | S2 Throughput | vs. Snappy | Better Throughput | vs. Snappy | +|-----------------------------------------------------------------------------------------------------|---------------|------------|-------------------|------------| +| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 2117 MB/s | 1.14x | 1738 MB/s | 0.94x | +| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 2401 MB/s | 1.25x | 2307 MB/s | 1.20x | +| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 2075 MB/s | 0.98x | 1764 MB/s | 0.83x | +| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 2967 MB/s | 1.05x | 2885 MB/s | 1.02x | +| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 4141 MB/s | 1.07x | 4184 MB/s | 1.08x | +| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 2264 MB/s | 1.12x | 2185 MB/s | 1.08x | +| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 1525 MB/s | 1.03x | 1347 MB/s | 0.91x | +| sharnd.out.2gb | 3813 MB/s | 0.79x | 3900 MB/s | 0.81x | +| [enwik9](http://mattmahoney.net/dc/textdata.html) | 1246 MB/s | 1.29x | 967 MB/s | 1.00x | +| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 1433 MB/s | 1.12x | 1203 MB/s | 0.94x | +| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 1284 MB/s | 1.32x | 1010 MB/s | 1.04x | + +### Legend + +* `S2 Throughput`: Decompression speed of S2 encoded content. +* `Better Throughput`: Decompression speed of S2 "better" encoded content. +* `vs Snappy`: Decompression speed of S2 "better" mode compared to Snappy and absolute speed. + + +While the decompression code hasn't changed, there is a significant speedup in decompression speed. +S2 prefers longer matches and will typically only find matches that are 6 bytes or longer. +While this reduces compression a bit, it improves decompression speed. + +The "better" compression mode will actively look for shorter matches, which is why it has a decompression speed quite similar to Snappy. + +Without assembly decompression is also very fast; single goroutine decompression speed. 
No assembly: + +| File | S2 Throughput | S2 throughput | +|--------------------------------|---------------|---------------| +| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s | +| 10gb.tar.s2 | 1.30x | 867.07 MB/s | +| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s | +| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s | +| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s | +| enwik9.s2 | 1.67x | 681.53 MB/s | +| adresser.json.s2 | 3.41x | 4230.53 MB/s | +| silesia.tar.s2 | 1.52x | 811.58 | + +Even though S2 typically compresses better than Snappy, decompression speed is always better. + +### Concurrent Stream Decompression + +For full stream decompression S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent) +that will decode a full stream using multiple goroutines. + +Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3 `, best of 3: + +| Input | `-cpu=1` | `-cpu=2` | `-cpu=4` | `-cpu=8` | `-cpu=16` | +|-------------------------------------------|------------|------------|------------|------------|-------------| +| enwik10.snappy | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s | +| enwik10.s2 | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s | +| sofia-air-quality-dataset.tar.snappy | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s | +| sofia-air-quality-dataset.tar.s2 | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s | +| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s | + +Scaling can be expected to be pretty linear until memory bandwidth is saturated. + +For now the DecodeConcurrent can only be used for full streams without seeking or combining with regular reads. + +## Block compression + + +When compressing blocks no concurrent compression is performed just as Snappy. +This is because blocks are for smaller payloads and generally will not benefit from concurrent compression. + +An important change is that incompressible blocks will not be more than at most 10 bytes bigger than the input. +In rare, worst case scenario Snappy blocks could be significantly bigger than the input. + +### Mixed content blocks + +The most reliable is a wide dataset. +For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), +53927 files, total input size: 4,014,735,833 bytes. Single goroutine used. + +| * | Input | Output | Reduction | MB/s | +|-------------------|------------|------------|------------|------------| +| S2 | 4014735833 | 1059723369 | 73.60% | **936.73** | +| S2 Better | 4014735833 | 961580539 | 76.05% | 451.10 | +| S2 Best | 4014735833 | 899182886 | **77.60%** | 46.84 | +| Snappy | 4014735833 | 1128706759 | 71.89% | 790.15 | +| S2, Snappy Output | 4014735833 | 1093823291 | 72.75% | 936.60 | +| LZ4 | 4014735833 | 1063768713 | 73.50% | 452.02 | + +S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best". +"Better" mode provides the same compression speed as LZ4 with better compression ratio. + +When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression. + +As can be seen from the other benchmarks decompression should also be easier on the S2 generated output. 
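+
+For reference, a minimal sketch of the single-block round trip these benchmarks exercise
+(single goroutine, no stream framing or CRC validation; the function name is illustrative):
+
+```Go
+func BlockRoundTrip(src []byte) ([]byte, error) {
+	// Encode compresses src as one S2 block.
+	block := s2.Encode(nil, src)
+	// Decode recovers the input; the decoded length is stored in the block.
+	return s2.Decode(nil, block)
+}
+```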
+
+## Block compression
+
+
+When compressing blocks, no concurrent compression is performed, just as with Snappy.
+This is because blocks are for smaller payloads and generally will not benefit from concurrent compression.
+
+An important change is that incompressible blocks will be at most 10 bytes bigger than the input.
+In rare, worst-case scenarios Snappy blocks could be significantly bigger than the input.
+
+### Mixed content blocks
+
+The most reliable benchmark is a wide dataset.
+For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
+53927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
+
+| *                 | Input      | Output     | Reduction  | MB/s       |
+|-------------------|------------|------------|------------|------------|
+| S2                | 4014735833 | 1059723369 | 73.60%     | **936.73** |
+| S2 Better         | 4014735833 | 961580539  | 76.05%     | 451.10     |
+| S2 Best           | 4014735833 | 899182886  | **77.60%** | 46.84      |
+| Snappy            | 4014735833 | 1128706759 | 71.89%     | 790.15     |
+| S2, Snappy Output | 4014735833 | 1093823291 | 72.75%     | 936.60     |
+| LZ4               | 4014735833 | 1063768713 | 73.50%     | 452.02     |
+
+S2 delivers both the best single-threaded throughput with regular mode and the best compression rate with "best".
+"Better" mode provides the same compression speed as LZ4 with a better compression ratio.
+
+When outputting Snappy-compatible output it still delivers better throughput (150MB/s more) and better compression.
+
+As can be seen from the other benchmarks, decompression should also be easier on the S2 generated output.
+
+Though they cannot be directly compared due to different decompression speeds, here are the speed/size comparisons for
+other Go compressors:
+
+| *                 | Input      | Output     | Reduction | MB/s   |
+|-------------------|------------|------------|-----------|--------|
+| Zstd Fastest (Go) | 4014735833 | 794608518  | 80.21%    | 236.04 |
+| Zstd Best (Go)    | 4014735833 | 704603356  | 82.45%    | 35.63  |
+| Deflate (Go) l1   | 4014735833 | 871294239  | 78.30%    | 214.04 |
+| Deflate (Go) l9   | 4014735833 | 730389060  | 81.81%    | 41.17  |
+
+### Standard block compression
+
+Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
+So individual benchmarks should only be seen as a guideline and the overall picture is more important.
+
+These micro-benchmarks are with data in cache and trained branch predictors. For a more realistic benchmark see the mixed content above.
+
+Block compression. Parallel benchmark running on 16 cores, 16 goroutines.
+
+AMD64 assembly is used for both S2 and Snappy.
+
+| Absolute Perf         | Snappy size | S2 Size | Snappy Speed | S2 Speed    | Snappy dec  | S2 dec      |
+|-----------------------|-------------|---------|--------------|-------------|-------------|-------------|
+| html                  | 22843       | 20868   | 16246 MB/s   | 18617 MB/s  | 40972 MB/s  | 49263 MB/s  |
+| urls.10K              | 335492      | 286541  | 7943 MB/s    | 10201 MB/s  | 22523 MB/s  | 26484 MB/s  |
+| fireworks.jpeg        | 123034      | 123100  | 349544 MB/s  | 303228 MB/s | 718321 MB/s | 827552 MB/s |
+| fireworks.jpeg (200B) | 146         | 155     | 8869 MB/s    | 20180 MB/s  | 33691 MB/s  | 52421 MB/s  |
+| paper-100k.pdf        | 85304       | 84202   | 167546 MB/s  | 112988 MB/s | 326905 MB/s | 291944 MB/s |
+| html_x_4              | 92234       | 20870   | 15194 MB/s   | 54457 MB/s  | 30843 MB/s  | 32217 MB/s  |
+| alice29.txt           | 88034       | 85934   | 5936 MB/s    | 6540 MB/s   | 12882 MB/s  | 20044 MB/s  |
+| asyoulik.txt          | 77503       | 79575   | 5517 MB/s    | 6657 MB/s   | 12735 MB/s  | 22806 MB/s  |
+| lcet10.txt            | 234661      | 220383  | 6235 MB/s    | 6303 MB/s   | 14519 MB/s  | 18697 MB/s  |
+| plrabn12.txt          | 319267      | 318196  | 5159 MB/s    | 6074 MB/s   | 11923 MB/s  | 19901 MB/s  |
+| geo.protodata         | 23335       | 18606   | 21220 MB/s   | 25432 MB/s  | 56271 MB/s  | 62540 MB/s  |
+| kppkn.gtb             | 69526       | 65019   | 9732 MB/s    | 8905 MB/s   | 18491 MB/s  | 18969 MB/s  |
+| alice29.txt (128B)    | 80          | 82      | 6691 MB/s    | 17179 MB/s  | 31883 MB/s  | 38874 MB/s  |
+| alice29.txt (1000B)   | 774         | 774     | 12204 MB/s   | 13273 MB/s  | 48056 MB/s  | 52341 MB/s  |
+| alice29.txt (10000B)  | 6648        | 6933    | 10044 MB/s   | 12824 MB/s  | 32378 MB/s  | 46322 MB/s  |
+| alice29.txt (20000B)  | 12686       | 13516   | 7733 MB/s    | 12160 MB/s  | 30566 MB/s  | 58969 MB/s  |
+
+
+Speed is generally at or above Snappy. Small blocks get a significant speedup, although at the expense of size.
+
+Decompression speed is better than Snappy, except in one case.
+
+Since payloads are very small, the variance in terms of size is rather big, so they should only be seen as a general guideline.
+
+Size is on average around Snappy, but varies with content type.
+In cases where compression is worse, it is usually compensated by a speed boost.
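+
+For reference, the block interface exercised by these benchmarks is small. A minimal round-trip sketch
+(the wrapper function is illustrative):
+
+```Go
+package example
+
+import (
+	"github.com/klauspost/compress/s2"
+)
+
+// roundTrip compresses a block with the default encoder and decodes it back.
+func roundTrip(src []byte) ([]byte, error) {
+	// Encode appends to the destination slice; passing nil allocates one.
+	// s2.EncodeBetter and s2.EncodeBest trade encoding speed for size.
+	compressed := s2.Encode(nil, src)
+	return s2.Decode(nil, compressed)
+}
+```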
+
+### Better compression
+
+As above, benchmarking single block performance is subject to a lot of variation,
+so individual benchmarks should only be seen as a guideline and the overall picture is more important.
+
+| Absolute Perf         | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec  | Better dec  |
+|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------|
+| html                  | 22843       | 18972       | 16246 MB/s   | 8621 MB/s    | 40972 MB/s  | 40292 MB/s  |
+| urls.10K              | 335492      | 248079      | 7943 MB/s    | 5104 MB/s    | 22523 MB/s  | 20981 MB/s  |
+| fireworks.jpeg        | 123034      | 123100      | 349544 MB/s  | 84429 MB/s   | 718321 MB/s | 823698 MB/s |
+| fireworks.jpeg (200B) | 146         | 149         | 8869 MB/s    | 7125 MB/s    | 33691 MB/s  | 30101 MB/s  |
+| paper-100k.pdf        | 85304       | 82887       | 167546 MB/s  | 11087 MB/s   | 326905 MB/s | 198869 MB/s |
+| html_x_4              | 92234       | 18982       | 15194 MB/s   | 29316 MB/s   | 30843 MB/s  | 30937 MB/s  |
+| alice29.txt           | 88034       | 71611       | 5936 MB/s    | 3709 MB/s    | 12882 MB/s  | 16611 MB/s  |
+| asyoulik.txt          | 77503       | 65941       | 5517 MB/s    | 3380 MB/s    | 12735 MB/s  | 14975 MB/s  |
+| lcet10.txt            | 234661      | 184939      | 6235 MB/s    | 3537 MB/s    | 14519 MB/s  | 16634 MB/s  |
+| plrabn12.txt          | 319267      | 264990      | 5159 MB/s    | 2960 MB/s    | 11923 MB/s  | 13382 MB/s  |
+| geo.protodata         | 23335       | 17689       | 21220 MB/s   | 10859 MB/s   | 56271 MB/s  | 57961 MB/s  |
+| kppkn.gtb             | 69526       | 55398       | 9732 MB/s    | 5206 MB/s    | 18491 MB/s  | 16524 MB/s  |
+| alice29.txt (128B)    | 80          | 78          | 6691 MB/s    | 7422 MB/s    | 31883 MB/s  | 34225 MB/s  |
+| alice29.txt (1000B)   | 774         | 746         | 12204 MB/s   | 5734 MB/s    | 48056 MB/s  | 42068 MB/s  |
+| alice29.txt (10000B)  | 6648        | 6218        | 10044 MB/s   | 6055 MB/s    | 32378 MB/s  | 28813 MB/s  |
+| alice29.txt (20000B)  | 12686       | 11492       | 7733 MB/s    | 3143 MB/s    | 30566 MB/s  | 27315 MB/s  |
+
+
+Except for the mostly incompressible JPEG image, compression is better, usually by
+double digits in terms of percentage reduction over Snappy.
+
+The PDF sample shows a significant slowdown compared to Snappy, as this mode tries harder
+to compress the data. Very small blocks are also not favorable for better compression, so throughput is way down.
+
+This mode trades encoding speed for better compression, and achieves that
+without a huge performance penalty, except on very small blocks.
+
+Decompression speed suffers a little compared to the regular S2 mode,
+but still manages to be close to Snappy in spite of increased compression.
+
+# Best compression mode
+
+S2 offers a "best" compression mode.
+
+This will compress as much as possible with little regard to CPU usage.
+
+It is mainly intended for offline compression, where decompression speed should still
+be high and the output should remain compatible with other S2 compressed data.
+
+Some examples compared on a 16-core CPU, amd64 assembly used:
+
+```
+* enwik10
+Default... 10000000000 -> 4759950115 [47.60%]; 1.03s, 9263.0MB/s
+Better... 10000000000 -> 4084706676 [40.85%]; 2.16s, 4415.4MB/s
+Best... 10000000000 -> 3615520079 [36.16%]; 42.259s, 225.7MB/s
+
+* github-june-2days-2019.json
+Default... 6273951764 -> 1041700255 [16.60%]; 431ms, 13882.3MB/s
+Better... 6273951764 -> 945841238 [15.08%]; 547ms, 10938.4MB/s
+Best... 6273951764 -> 826392576 [13.17%]; 9.455s, 632.8MB/s
+
+* nyc-taxi-data-10M.csv
+Default... 3325605752 -> 1093516949 [32.88%]; 324ms, 9788.7MB/s
+Better... 3325605752 -> 885394158 [26.62%]; 491ms, 6459.4MB/s
+Best... 3325605752 -> 773681257 [23.26%]; 8.29s, 412.0MB/s
+
+* 10gb.tar
+Default... 10065157632 -> 5915541066 [58.77%]; 1.028s, 9337.4MB/s
+Better... 10065157632 -> 5453844650 [54.19%]; 1.597s, 4862.7MB/s
+Best... 10065157632 -> 5192495021 [51.59%]; 32.78s, 308.2MB/s
+
+* consensus.db.10gb
+Default... 10737418240 -> 4549762344 [42.37%]; 882ms, 12118.4MB/s
+Better... 10737418240 -> 4438535064 [41.34%]; 1.533s, 3500.9MB/s
+Best... 10737418240 -> 4210602774 [39.21%]; 42.96s, 254.4MB/s
+```
+
+Decompression speed should be around the same as using the 'better' compression mode.
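+
+A minimal sketch of selecting the best mode for blocks and streams
+(the wrapper functions are illustrative):
+
+```Go
+package example
+
+import (
+	"io"
+
+	"github.com/klauspost/compress/s2"
+)
+
+// compressBlockBest compresses a single block with maximum compression.
+func compressBlockBest(src []byte) []byte {
+	return s2.EncodeBest(nil, src)
+}
+
+// compressStreamBest writes src to dst as an S2 stream with maximum compression.
+func compressStreamBest(dst io.Writer, src io.Reader) error {
+	enc := s2.NewWriter(dst, s2.WriterBestCompression())
+	if _, err := io.Copy(enc, src); err != nil {
+		enc.Close()
+		return err
+	}
+	// Close flushes any remaining data and writes the stream trailer.
+	return enc.Close()
+}
+```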
+
+## Dictionaries
+
+*Note: S2 dictionary compression is currently at an early implementation stage, with no assembly for
+either encoding or decoding. Performance improvements can be expected in the future.*
+
+Adding dictionaries allows providing a custom dictionary that will serve as a lookup at the beginning of blocks.
+
+The same dictionary *must* be used for both encoding and decoding.
+S2 does not keep track of whether the same dictionary is used,
+and using the wrong dictionary will most often not result in an error when decompressing.
+
+Blocks encoded *without* dictionaries can be decompressed seamlessly *with* a dictionary.
+This means it is possible to switch from an encoding without dictionaries to an encoding with dictionaries
+and treat the blocks similarly.
+
+Similar to [zStandard dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression),
+the same usage scenario applies to S2 dictionaries.
+
+> Training works if there is some correlation in a family of small data samples. The more data-specific a dictionary is, the more efficient it is (there is no universal dictionary). Hence, deploying one dictionary per type of data will provide the greatest benefits. Dictionary gains are mostly effective in the first few KB. Then, the compression algorithm will gradually use previously decoded content to better compress the rest of the file.
+
+S2 further limits the dictionary to only be enabled on the first 64KB of a block.
+This will remove any negative (speed) impacts of the dictionaries on bigger blocks.
+
+### Compression
+
+Using the [github_users_sample_set](https://github.com/facebook/zstd/releases/download/v1.1.3/github_users_sample_set.tar.zst)
+and a 64KB dictionary trained with zStandard, the following sizes can be achieved.
+
+|                    | Default          | Better           | Best             |
+|--------------------|------------------|------------------|------------------|
+| Without Dictionary | 3362023 (44.92%) | 3083163 (41.19%) | 3057944 (40.86%) |
+| With Dictionary    | 921524 (12.31%)  | 873154 (11.67%)  | 785503 (10.49%)  |
+
+So for highly repetitive content, this case provides more than a 3.5x reduction in size.
+
+For less uniform data we will use the Go source code tree.
+Compressing the first 64KB of all `.go` files in `go/src`, Go 1.19.5, 8912 files, 51253563 bytes input:
+
+|                    | Default           | Better            | Best              |
+|--------------------|-------------------|-------------------|-------------------|
+| Without Dictionary | 22955767 (44.79%) | 20189613 (39.39%) | 19482828 (38.01%) |
+| With Dictionary    | 19654568 (38.35%) | 16289357 (31.78%) | 15184589 (29.63%) |
+| Saving/file        | 362 bytes         | 428 bytes         | 472 bytes         |
+
+
+### Creating Dictionaries
+
+There are no tools to create dictionaries in S2.
+However, there are multiple ways to create a useful dictionary:
+
+#### Using a Sample File
+
+If your input is very uniform, you can just use a sample file as the dictionary.
+
+For example, in the `github_users_sample_set` above, the average compression only goes up from
+10.49% to 11.48% by using the first file as dictionary compared to using a dedicated dictionary.
+
+```Go
+	// Read a sample
+	sample, err := os.ReadFile("sample.json")
+
+	// Create a dictionary.
+	dict := s2.MakeDict(sample, nil)
+
+	// b := dict.Bytes() will provide a dictionary that can be saved
+	// and reloaded with s2.NewDict(b).
+
+	// To encode:
+	encoded := dict.Encode(nil, file)
+
+	// To decode:
+	decoded, err := dict.Decode(nil, encoded)
+```
+
+#### Using Zstandard
+
+Zstandard dictionaries can easily be converted to S2 dictionaries.
+
+This can be helpful for generating dictionaries for files that don't have a fixed structure.
+
+
+Example, with training set files placed in `./training-set`:
+
+`λ zstd -r --train-fastcover training-set/* --maxdict=65536 -o name.dict`
+
+This will create a 64KB dictionary, which can be converted to an S2 dictionary like this:
+
+```Go
+	// Decode the Zstandard dictionary.
+	insp, err := zstd.InspectDictionary(zdict)
+	if err != nil {
+		panic(err)
+	}
+
+	// We are only interested in the contents.
+	// Assume that files start with "// Copyright (c) 2023".
+	// Search for the longest match for that.
+	// This may save a few bytes.
+	dict := s2.MakeDict(insp.Content(), []byte("// Copyright (c) 2023"))
+
+	// b := dict.Bytes() will provide a dictionary that can be saved
+	// and reloaded with s2.NewDict(b).
+
+	// We can now encode using this dictionary:
+	encodedWithDict := dict.Encode(nil, payload)
+
+	// To decode content:
+	decoded, err := dict.Decode(nil, encodedWithDict)
+```
+
+It is recommended to save the dictionary returned by `b := dict.Bytes()`, since that will contain only the S2 dictionary.
+
+This dictionary can later be loaded using `s2.NewDict(b)`. The dictionary then no longer requires `zstd` to be initialized.
+
+Also note how `s2.MakeDict` allows you to search for a common starting sequence of your files.
+This can be omitted, at the expense of a few bytes.
+
+# Snappy Compatibility
+
+S2 now offers full compatibility with Snappy.
+
+This means that the efficient encoders of S2 can be used to generate fully Snappy-compatible output.
+
+There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by
+simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`.
+This uses "better" mode for all operations.
+If you would like more control, you can use the s2 package as described below:
+
+## Blocks
+
+Snappy-compatible blocks can be generated with the S2 encoder.
+Compression and speed are typically a bit better, and `MaxEncodedLen` is also smaller, reducing memory usage. Replace:
+
+| Snappy                    | S2 replacement        |
+|---------------------------|-----------------------|
+| snappy.Encode(...)        | s2.EncodeSnappy(...)  |
+| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
+
+`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed, Snappy-compatible output.
+
+`s2.ConcatBlocks` is compatible with Snappy blocks.
+
+Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
+53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
+
+| Encoder               | Size       | MB/s       | Reduction  |
+|-----------------------|------------|------------|------------|
+| snappy.Encode         | 1128706759 | 725.59     | 71.89%     |
+| s2.EncodeSnappy       | 1093823291 | **899.16** | 72.75%     |
+| s2.EncodeSnappyBetter | 1001158548 | 578.49     | 75.06%     |
+| s2.EncodeSnappyBest   | 944507998  | 66.00      | **76.47%** |
+
+## Streams
+
+For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`; see the sketch below.
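+
+A minimal sketch of the replacement writer (the wrapper function is illustrative):
+
+```Go
+package example
+
+import (
+	"io"
+
+	"github.com/klauspost/compress/s2"
+)
+
+// compressSnappyStream writes src to dst as a Snappy-compatible stream,
+// compressed with the faster S2 encoder.
+func compressSnappyStream(dst io.Writer, src io.Reader) error {
+	enc := s2.NewWriter(dst, s2.WriterSnappyCompat())
+	if _, err := io.Copy(enc, src); err != nil {
+		enc.Close()
+		return err
+	}
+	return enc.Close()
+}
+```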
+
+All other options are available, but note that the block size limit is different for Snappy.
+
+Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput:
+
+| File                        | snappy.NewWriter         | S2 Snappy                 | S2 Snappy, Better        | S2 Snappy, Best         |
+|-----------------------------|--------------------------|---------------------------|--------------------------|-------------------------|
+| nyc-taxi-data-10M.csv       | 1316042016 - 539.47MB/s  | 1307003093 - 10132.73MB/s | 1174534014 - 5002.44MB/s | 1115904679 - 177.97MB/s |
+| enwik10 (xml)               | 5088294643 - 451.13MB/s  | 5175840939 - 9440.69MB/s  | 4560784526 - 4487.21MB/s | 4340299103 - 158.92MB/s |
+| 10gb.tar (mixed)            | 6056946612 - 729.73MB/s  | 6208571995 - 9978.05MB/s  | 5741646126 - 4919.98MB/s | 5548973895 - 180.44MB/s |
+| github-june-2days-2019.json | 1525176492 - 933.00MB/s  | 1476519054 - 13150.12MB/s | 1400547532 - 5803.40MB/s | 1321887137 - 204.29MB/s |
+| consensus.db.10gb (db)      | 5412897703 - 1102.14MB/s | 5354073487 - 13562.91MB/s | 5335069899 - 5294.73MB/s | 5201000954 - 175.72MB/s |
+
+## Decompression
+
+All decompression functions map directly to equivalent s2 functions.
+
+| Snappy                 | S2 replacement     |
+|------------------------|--------------------|
+| snappy.Decode(...)     | s2.Decode(...)     |
+| snappy.DecodedLen(...) | s2.DecodedLen(...) |
+| snappy.NewReader(...)  | s2.NewReader(...)  |
+
+Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip)
+are also available for Snappy streams.
+
+If you know you are only decompressing Snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize)
+on your Reader will reduce memory consumption.
+
+# Concatenating blocks and streams
+
+Concatenating streams will concatenate the output of both without recompressing them.
+While this is inefficient in terms of compression, it might be usable in certain scenarios.
+The 10-byte 'stream identifier' of the second stream can optionally be stripped, but it is not a requirement.
+
+Blocks can be concatenated using the `ConcatBlocks` function.
+
+Snappy blocks/streams can safely be concatenated with S2 blocks and streams.
+Streams with indexes (see below) will currently not work on concatenated streams.
+
+# Stream Seek Index
+
+S2 and Snappy streams can have indexes. These indexes will allow random seeking within the compressed data.
+
+The index can either be appended to the stream as a skippable block or returned for separate storage.
+
+When the index is appended to a stream, it will be skipped by regular decoders,
+so the output remains compatible with other decoders.
+
+## Creating an Index
+
+To automatically add an index to a stream, add the `WriterAddIndex()` option to your writer.
+The index will then be added to the stream when `Close()` is called.
+
+```
+	// Add Index to stream...
+	enc := s2.NewWriter(w, s2.WriterAddIndex())
+	io.Copy(enc, r)
+	enc.Close()
+```
+
+If you want to store the index separately, you can use `CloseIndex()` instead of the regular `Close()`.
+This will return the index. Note that `CloseIndex()` should only be called once, and you shouldn't call `Close()`.
+
+```
+	// Get index for separate storage...
+	enc := s2.NewWriter(w)
+	io.Copy(enc, r)
+	index, err := enc.CloseIndex()
+```
+
+The `index` can then be used when reading from the stream.
+This means the index can be used without needing to seek to the end of the stream,
+or it can be used for manually forwarding streams. See below.
+
+Finally, an existing S2/Snappy stream can be indexed using the `s2.IndexStream(r io.Reader)` function.
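+
+A minimal sketch of indexing an existing stream and storing the index separately
+(file names are illustrative):
+
+```Go
+package example
+
+import (
+	"os"
+
+	"github.com/klauspost/compress/s2"
+)
+
+// indexFile creates a separate index for an existing S2/Snappy stream.
+func indexFile() error {
+	f, err := os.Open("data.s2")
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	// IndexStream reads through the stream and returns a serialized index.
+	idx, err := s2.IndexStream(f)
+	if err != nil {
+		return err
+	}
+	return os.WriteFile("data.s2.index", idx, 0o644)
+}
+```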
+
+## Using Indexes
+
+To use indexes, there is a `ReadSeeker(random bool, index []byte) (*ReadSeeker, error)` method available on the reader.
+
+Calling `ReadSeeker` will return an [io.ReadSeeker](https://pkg.go.dev/io#ReadSeeker)-compatible version of the reader.
+
+If 'random' is specified, the returned io.Seeker can be used for random seeking; otherwise only forward seeking is supported.
+Enabling random seeking requires the original input to support the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
+
+```
+	dec := s2.NewReader(r)
+	rs, err := dec.ReadSeeker(false, nil)
+	rs.Seek(wantOffset, io.SeekStart)
+```
+
+This gets a seeker that can seek forward. Since no index is provided, the index is read from the stream.
+This requires that an index was added and that `r` supports the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
+
+A custom index can be specified, which will be used if supplied.
+When using a custom index, it will not be read from the input stream.
+
+```
+	dec := s2.NewReader(r)
+	rs, err := dec.ReadSeeker(false, index)
+	rs.Seek(wantOffset, io.SeekStart)
+```
+
+This will read the index from `index`. Since we specify non-random (forward-only) seeking, `r` does not have to be an io.Seeker.
+
+```
+	dec := s2.NewReader(r)
+	rs, err := dec.ReadSeeker(true, index)
+	rs.Seek(wantOffset, io.SeekStart)
+```
+
+Finally, since we specify that we want to do random seeking, `r` must be an io.Seeker.
+
+The returned [ReadSeeker](https://pkg.go.dev/github.com/klauspost/compress/s2#ReadSeeker) contains a shallow reference to the existing Reader,
+meaning changes performed to one are reflected in the other.
+
+To check if a stream contains an index at the end, the `(*Index).LoadStream(rs io.ReadSeeker) error` method can be used.
+
+## Manually Forwarding Streams
+
+Indexes can also be read outside the decoder using the [Index](https://pkg.go.dev/github.com/klauspost/compress/s2#Index) type.
+This can be used for parsing indexes, either separate or in streams.
+
+In some cases it may not be possible to serve a seekable stream.
+This can for instance be an HTTP stream, where the Range request
+is sent at the start of the stream.
+
+With a little bit of extra code it is still possible to use indexes
+to forward to a specific offset with a single forward skip.
+
+It is possible to load the index manually like this:
+```
+	var index s2.Index
+	_, err = index.Load(idxBytes)
+```
+
+This can be used to figure out how much to offset the compressed stream:
+
+```
+	compressedOffset, uncompressedOffset, err := index.Find(wantOffset)
+```
+
+The `compressedOffset` is the number of bytes that should be skipped
+from the beginning of the compressed file.
+
+The `uncompressedOffset` will then be the offset of the uncompressed bytes returned
+when decoding from that position. This will always be <= wantOffset.
+
+When creating a decoder, it must be specified that it should *not* expect a stream identifier
+at the beginning of the stream. Assuming the io.Reader `r` has been forwarded to `compressedOffset`,
+we create the decoder like this:
+
+```
+	dec := s2.NewReader(r, s2.ReaderIgnoreStreamIdentifier())
+```
+
+We are not completely done. We still need to forward the stream past the uncompressed bytes we didn't want.
+This is done using the regular `Skip` function:
+
+```
+	err = dec.Skip(wantOffset - uncompressedOffset)
+```
+
+This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset.
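+
+Putting the pieces together, a sketch of the full manual-forwarding flow. The `forward`
+callback, which must skip n bytes of the compressed stream (e.g. via an HTTP Range request),
+is an assumption of this example:
+
+```Go
+package example
+
+import (
+	"io"
+
+	"github.com/klauspost/compress/s2"
+)
+
+// forwardTo returns a reader positioned at wantOffset of the uncompressed data.
+// idxBytes is a previously stored index for the stream.
+func forwardTo(idxBytes []byte, wantOffset int64, forward func(n int64) (io.Reader, error)) (io.Reader, error) {
+	var index s2.Index
+	if _, err := index.Load(idxBytes); err != nil {
+		return nil, err
+	}
+	compressedOffset, uncompressedOffset, err := index.Find(wantOffset)
+	if err != nil {
+		return nil, err
+	}
+	// Skip compressedOffset bytes of the compressed stream.
+	r, err := forward(compressedOffset)
+	if err != nil {
+		return nil, err
+	}
+	// The stream identifier was skipped along with the compressed bytes.
+	dec := s2.NewReader(r, s2.ReaderIgnoreStreamIdentifier())
+	// Discard the decompressed bytes before the wanted offset.
+	if err := dec.Skip(wantOffset - uncompressedOffset); err != nil {
+		return nil, err
+	}
+	return dec, nil
+}
+```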
+
+# Compact storage
+
+For compact storage, [RemoveIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RemoveIndexHeaders) can be used to remove any redundant info from
+a serialized index. If you remove the header, it must be restored before [Loading](https://pkg.go.dev/github.com/klauspost/compress/s2#Index.Load).
+
+This is expected to save 20 bytes. The headers can be restored using [RestoreIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RestoreIndexHeaders). This removes a layer of security, but is the most compact representation. `RestoreIndexHeaders` returns nil if the headers contain errors.
+
+## Index Format:
+
+Each block is structured as a snappy skippable block, with the chunk ID 0x99.
+
+The block can be read from the front, but contains information so it can be read from the back as well.
+
+Numbers are stored as fixed-size little-endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding),
+with an un-encoded value length of 64 bits, unless other limits are specified.
+
+| Content                              | Format                                                                                                                        |
+|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
+| ID, `[1]byte`                        | Always 0x99.                                                                                                                  |
+| Data Length, `[3]byte`               | 3 byte little-endian length of the chunk in bytes, following this.                                                            |
+| Header `[6]byte`                     | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00".                                                        |
+| UncompressedSize, Varint             | Total Uncompressed size.                                                                                                      |
+| CompressedSize, Varint               | Total Compressed size if known. Should be -1 if unknown.                                                                      |
+| EstBlockSize, Varint                 | Block Size, used for guessing uncompressed offsets. Must be >= 0.                                                             |
+| Entries, Varint                      | Number of Entries in index, must be < 65536 and >=0.                                                                          |
+| HasUncompressedOffsets `byte`        | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid.                                             |
+| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode.                                                                                |
+| CompressedOffsets, [Entries]VarInt   | Compressed offsets. See below how to decode.                                                                                  |
+| Block Size, `[4]byte`                | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block.      |
+| Trailer `[6]byte`                    | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
+
+For regular streams the uncompressed offsets are fully predictable,
+so `HasUncompressedOffsets` allows specifying that compressed blocks all have
+exactly `EstBlockSize` bytes of uncompressed content.
+
+Entries *must* be in order, starting with the lowest offset,
+and there *must* be no uncompressed offset duplicates.
+Entries *may* point to the start of a skippable block,
+but it is then not allowed to also have an entry for the next block, since
+that would give an uncompressed offset duplicate.
+
+There is no requirement for all blocks to be represented in the index.
+In fact, an index can hold at most 65535 block entries, since the entry count must be < 65536.
+
+The writer can use any method to reduce the number of entries.
+An implicit block start at 0,0 can be assumed.
+
+### Decoding entries:
+
+```
+// Read Uncompressed entries.
+// Each assumes EstBlockSize delta from previous.
+for each entry {
+    uOff = 0
+    if HasUncompressedOffsets == 1 {
+        uOff = ReadVarInt // Read value from stream
+    }
+
+    // Except for the first entry, use previous values.
+    if entryNum == 0 {
+        entry[entryNum].UncompressedOffset = uOff
+        continue
+    }
+
+    // Uncompressed uses previous offset and adds EstBlockSize
+    entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff
+}
+
+
+// Guess that the first block will be 50% of uncompressed size.
+// Integer truncating division must be used.
+CompressGuess := EstBlockSize / 2
+
+// Read Compressed entries.
+// Each assumes CompressGuess delta from previous.
+// CompressGuess is adjusted for each value.
+for each entry {
+    cOff = ReadVarInt // Read value from stream
+
+    // Except for the first entry, use previous values.
+    if entryNum == 0 {
+        entry[entryNum].CompressedOffset = cOff
+        continue
+    }
+
+    // Compressed uses previous and our estimate.
+    entry[entryNum].CompressedOffset = entry[entryNum-1].CompressedOffset + CompressGuess + cOff
+
+    // Adjust compressed offset for next loop, integer truncating division must be used.
+    CompressGuess += cOff/2
+}
+```
+
+To decode from any given uncompressed offset `(wantOffset)`:
+
+* Iterate entries until `entry[n].UncompressedOffset > wantOffset`.
+* Start decoding from `entry[n-1].CompressedOffset`.
+* Discard `wantOffset - entry[n-1].UncompressedOffset` bytes from the decoded stream.
+
+See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface.
+
+
+# Format Extensions
+
+* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
+* [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB).
+* Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset.
+
+Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0.
+
+The length is determined by reading the 3-bit length code in the tag and decoding it using this table:
+
+| Length | Actual Length        |
+|--------|----------------------|
+| 0      | 4                    |
+| 1      | 5                    |
+| 2      | 6                    |
+| 3      | 7                    |
+| 4      | 8                    |
+| 5      | 8 + read 1 byte      |
+| 6      | 260 + read 2 bytes   |
+| 7      | 65540 + read 3 bytes |
+
+This allows any repeat offset + length to be represented by 2 to 5 bytes.
+It also allows emitting matches longer than 64 bytes with one copy + one repeat instead of several 64-byte copies.
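+
+As a sketch, decoding a repeat length according to the table above could look like this
+(the helper is illustrative and not part of the package API):
+
+```Go
+package example
+
+// repeatLength decodes the length of a repeat operation from the 3-bit
+// length code in the tag byte and any extra bytes following the tag.
+// It returns the decoded length and the number of extra bytes consumed.
+func repeatLength(code uint8, extra []byte) (length, used int) {
+	switch code {
+	case 5:
+		return int(extra[0]) + 8, 1
+	case 6:
+		return (int(extra[0]) | int(extra[1])<<8) + 260, 2
+	case 7:
+		return (int(extra[0]) | int(extra[1])<<8 | int(extra[2])<<16) + 65540, 3
+	default: // Codes 0-4 encode lengths 4-8 directly.
+		return int(code) + 4, 0
+	}
+}
+```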
+
+Lengths are stored as little-endian values.
+
+The first copy of a block cannot be a repeat offset, and the offset is reset on every block in streams.
+
+The default streaming block size is 1MB.
+
+# Dictionary Encoding
+
+Adding dictionaries allows providing a custom dictionary that will serve as a lookup at the beginning of blocks.
+
+A dictionary provides an initial repeat value that can be used to point to a common header.
+
+Other than that, the dictionary contains values that can be used as back-references.
+
+Frequently used data should be placed at the *end* of the dictionary, since offsets < 2048 bytes will be smaller.
+
+## Format
+
+Dictionary *content* must be at least 16 bytes and less than or equal to 64KiB (65536 bytes).
+
+Encoding: `[repeat value (uvarint)][dictionary content...]`
+
+The dictionary content is preceded by an unsigned base-128 (uvarint) encoded value specifying the initial repeat offset.
+This value is an offset into the dictionary content and not a back-reference offset,
+so setting this to 0 will make the repeat value point to the first value of the dictionary.
+
+The value must be less than the dictionary length minus 8.
+
+## Encoding
+
+From the decoder's point of view, the dictionary content is seen as preceding the encoded content.
+
+`[dictionary content][decoded output]`
+
+Back-references to the dictionary are encoded as ordinary back-references that have an offset before the start of the decoded block.
+
+Matches copying from the dictionary are **not** allowed to cross from the dictionary into the decoded data.
+However, if a copy ends at the end of the dictionary, the next repeat will point to the start of the decoded buffer, which is allowed.
+
+The first match can be a repeat value, which will use the repeat offset stored in the dictionary.
+
+When 64KB (65536 bytes) has been en/decoded, it is no longer allowed to reference the dictionary,
+neither by copy nor by repeat operations.
+If the boundary is crossed while copying from the dictionary, the operation should complete,
+but the next instruction is not allowed to reference the dictionary.
+
+Valid blocks encoded *without* a dictionary can be decoded with any dictionary.
+There are no checks whether the supplied dictionary is the correct one for a block.
+Because of this, there is no overhead in using a dictionary.
+
+## Example
+
+This is the dictionary content. Elements are separated by `[]`.
+
+Dictionary: `[0x0a][Yesterday 25 bananas were added to Benjamins brown bag]`.
+
+The initial repeat offset is set at 10, which is the letter `2`.
+
+Encoded: `[LIT "10"][REPEAT len=10][LIT "hich"][MATCH off=50 len=6][MATCH off=31 len=6][MATCH off=61 len=10]`
+
+Decoded: `[10][ bananas w][hich][ were ][brown ][were added]`
+
+Output: `10 bananas which were brown were added`
+
+
+## Streams
+
+For streams, each block can use the dictionary.
+
+The dictionary cannot currently be provided on the stream.
+
+
+# LICENSE
+
+This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation.
+
+Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go
new file mode 100644
index 0000000000..264ffd0a9b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode.go
@@ -0,0 +1,443 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"strconv"
+
+	"github.com/klauspost/compress/internal/race"
+)
+
+var (
+	// ErrCorrupt reports that the input is invalid.
+	ErrCorrupt = errors.New("s2: corrupt input")
+	// ErrCRC reports that the input failed CRC validation (streams only)
+	ErrCRC = errors.New("s2: corrupt input, crc mismatch")
+	// ErrTooLarge reports that the uncompressed length is too large.
+	ErrTooLarge = errors.New("s2: decoded block is too large")
+	// ErrUnsupported reports that the input isn't supported.
+ ErrUnsupported = errors.New("s2: unsupported input") +) + +// DecodedLen returns the length of the decoded block. +func DecodedLen(src []byte) (int, error) { + v, _, err := decodedLen(src) + return v, err +} + +// decodedLen returns the length of the decoded block and the number of bytes +// that the length header occupied. +func decodedLen(src []byte) (blockLen, headerLen int, err error) { + v, n := binary.Uvarint(src) + if n <= 0 || v > 0xffffffff { + return 0, 0, ErrCorrupt + } + + const wordSize = 32 << (^uint(0) >> 32 & 1) + if wordSize == 32 && v > 0x7fffffff { + return 0, 0, ErrTooLarge + } + return int(v), n, nil +} + +const ( + decodeErrCodeCorrupt = 1 +) + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +func Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if dLen <= cap(dst) { + dst = dst[:dLen] + } else { + dst = make([]byte, dLen) + } + + race.WriteSlice(dst) + race.ReadSlice(src[s:]) + + if s2Decode(dst, src[s:]) != 0 { + return nil, ErrCorrupt + } + return dst, nil +} + +// s2DecodeDict writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. +// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. +func s2DecodeDict(dst, src []byte, dict *Dict) int { + if dict == nil { + return s2Decode(dst, src) + } + const debug = false + const debugErrs = debug + + if debug { + fmt.Println("Starting decode, dst len:", len(dst)) + } + var d, s, length int + offset := len(dict.dict) - dict.repeat + + // As long as we can read at least 5 bytes... + for s < len(src)-5 { + // Removing bounds checks is SLOWER, when if doing + // in := src[s:s+5] + // Checked on Go 1.18 + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + x = uint32(src[s-1]) + case x == 61: + in := src[s : s+3] + x = uint32(in[1]) | uint32(in[2])<<8 + s += 3 + case x == 62: + in := src[s : s+4] + // Load as 32 bit and shift down. 
+ x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + x >>= 8 + s += 4 + case x == 63: + in := src[s : s+5] + x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24 + s += 5 + } + length = int(x) + 1 + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + if debugErrs { + fmt.Println("corrupt literal: length:", length, "d-left:", len(dst)-d, "src-left:", len(src)-s) + } + return decodeErrCodeCorrupt + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + length = int(src[s-2]) >> 2 & 0x7 + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + length = int(src[s]) + 4 + s += 1 + case 6: + in := src[s : s+2] + length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8) + s += 2 + case 7: + in := src[s : s+3] + length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16) + s += 3 + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + in := src[s : s+3] + offset = int(uint32(in[1]) | uint32(in[2])<<8) + length = 1 + int(in[0])>>2 + s += 3 + + case tagCopy4: + in := src[s : s+5] + offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24) + length = 1 + int(in[0])>>2 + s += 5 + } + + if offset <= 0 || length > len(dst)-d { + if debugErrs { + fmt.Println("match error; offset:", offset, "length:", length, "dst-left:", len(dst)-d) + } + return decodeErrCodeCorrupt + } + + // copy from dict + if d < offset { + if d > MaxDictSrcOffset { + if debugErrs { + fmt.Println("dict after", MaxDictSrcOffset, "d:", d, "offset:", offset, "length:", length) + } + return decodeErrCodeCorrupt + } + startOff := len(dict.dict) - offset + d + if startOff < 0 || startOff+length > len(dict.dict) { + if debugErrs { + fmt.Printf("offset (%d) + length (%d) bigger than dict (%d)\n", offset, length, len(dict.dict)) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("dict copy, length:", length, "offset:", offset, "d-after:", d+length, "dict start offset:", startOff) + } + copy(dst[d:d+length], dict.dict[startOff:]) + d += length + continue + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + // Remaining with extra checks... + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. 
+ if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + x = uint32(src[s-1]) + case x == 61: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + x = uint32(src[s-2]) | uint32(src[s-1])<<8 + case x == 62: + s += 4 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + case x == 63: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + } + length = int(x) + 1 + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + if debugErrs { + fmt.Println("corrupt literal: length:", length, "d-left:", len(dst)-d, "src-left:", len(src)-s) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = int(src[s-2]) >> 2 & 0x7 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + s += 1 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-1])) + 4 + case 6: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) + case 7: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + + case tagCopy4: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. 
+ if debugErrs { + fmt.Println("src went oob") + } + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-5])>>2 + offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + } + + if offset <= 0 || length > len(dst)-d { + if debugErrs { + fmt.Println("match error; offset:", offset, "length:", length, "dst-left:", len(dst)-d) + } + return decodeErrCodeCorrupt + } + + // copy from dict + if d < offset { + if d > MaxDictSrcOffset { + if debugErrs { + fmt.Println("dict after", MaxDictSrcOffset, "d:", d, "offset:", offset, "length:", length) + } + return decodeErrCodeCorrupt + } + rOff := len(dict.dict) - (offset - d) + if debug { + fmt.Println("starting dict entry from dict offset", len(dict.dict)-rOff) + } + if rOff+length > len(dict.dict) { + if debugErrs { + fmt.Println("err: END offset", rOff+length, "bigger than dict", len(dict.dict), "dict offset:", rOff, "length:", length) + } + return decodeErrCodeCorrupt + } + if rOff < 0 { + if debugErrs { + fmt.Println("err: START offset", rOff, "less than 0", len(dict.dict), "dict offset:", rOff, "length:", length) + } + return decodeErrCodeCorrupt + } + copy(dst[d:d+length], dict.dict[rOff:]) + d += length + continue + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + if d != len(dst) { + if debugErrs { + fmt.Println("wanted length", len(dst), "got", d) + } + return decodeErrCodeCorrupt + } + return 0 +} diff --git a/vendor/github.com/golang/snappy/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s similarity index 60% rename from vendor/github.com/golang/snappy/decode_amd64.s rename to vendor/github.com/klauspost/compress/s2/decode_amd64.s index e6179f65e3..9b105e03c5 100644 --- a/vendor/github.com/golang/snappy/decode_amd64.s +++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s @@ -1,4 +1,5 @@ // Copyright 2016 The Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. @@ -8,6 +9,21 @@ #include "textflag.h" +#define R_TMP0 AX +#define R_TMP1 BX +#define R_LEN CX +#define R_OFF DX +#define R_SRC SI +#define R_DST DI +#define R_DBASE R8 +#define R_DLEN R9 +#define R_DEND R10 +#define R_SBASE R11 +#define R_SLEN R12 +#define R_SEND R13 +#define R_TMP2 R14 +#define R_TMP3 R15 + // The asm code generally follows the pure Go code in decode_other.go, except // where marked with a "!!!". @@ -15,51 +31,52 @@ // // All local variables fit into registers. The non-zero stack size is only to // spill registers and push args when issuing a CALL. 
The register allocation: -// - AX scratch -// - BX scratch -// - CX length or x -// - DX offset -// - SI &src[s] -// - DI &dst[d] -// + R8 dst_base -// + R9 dst_len -// + R10 dst_base + dst_len -// + R11 src_base -// + R12 src_len -// + R13 src_base + src_len -// - R14 used by doCopy -// - R15 used by doCopy +// - R_TMP0 scratch +// - R_TMP1 scratch +// - R_LEN length or x (shared) +// - R_OFF offset +// - R_SRC &src[s] +// - R_DST &dst[d] +// + R_DBASE dst_base +// + R_DLEN dst_len +// + R_DEND dst_base + dst_len +// + R_SBASE src_base +// + R_SLEN src_len +// + R_SEND src_base + src_len +// - R_TMP2 used by doCopy +// - R_TMP3 used by doCopy // -// The registers R8-R13 (marked with a "+") are set at the start of the +// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the // function, and after a CALL returns, and are not otherwise modified. // -// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI. -// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI. -TEXT ·decode(SB), NOSPLIT, $48-56 - // Initialize SI, DI and R8-R13. - MOVQ dst_base+0(FP), R8 - MOVQ dst_len+8(FP), R9 - MOVQ R8, DI - MOVQ R8, R10 - ADDQ R9, R10 - MOVQ src_base+24(FP), R11 - MOVQ src_len+32(FP), R12 - MOVQ R11, SI - MOVQ R11, R13 - ADDQ R12, R13 +// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. +// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. +TEXT ·s2Decode(SB), NOSPLIT, $48-56 + // Initialize R_SRC, R_DST and R_DBASE-R_SEND. + MOVQ dst_base+0(FP), R_DBASE + MOVQ dst_len+8(FP), R_DLEN + MOVQ R_DBASE, R_DST + MOVQ R_DBASE, R_DEND + ADDQ R_DLEN, R_DEND + MOVQ src_base+24(FP), R_SBASE + MOVQ src_len+32(FP), R_SLEN + MOVQ R_SBASE, R_SRC + MOVQ R_SBASE, R_SEND + ADDQ R_SLEN, R_SEND + XORQ R_OFF, R_OFF loop: // for s < len(src) - CMPQ SI, R13 + CMPQ R_SRC, R_SEND JEQ end - // CX = uint32(src[s]) + // R_LEN = uint32(src[s]) // // switch src[s] & 0x03 - MOVBLZX (SI), CX - MOVL CX, BX - ANDL $3, BX - CMPL BX, $1 + MOVBLZX (R_SRC), R_LEN + MOVL R_LEN, R_TMP1 + ANDL $3, R_TMP1 + CMPL R_TMP1, $1 JAE tagCopy // ---------------------------------------- @@ -68,35 +85,35 @@ loop: // case tagLiteral: // x := uint32(src[s] >> 2) // switch - SHRL $2, CX - CMPL CX, $60 + SHRL $2, R_LEN + CMPL R_LEN, $60 JAE tagLit60Plus // case x < 60: // s++ - INCQ SI + INCQ R_SRC doLit: // This is the end of the inner "switch", when we have a literal tag. // - // We assume that CX == x and x fits in a uint32, where x is the variable + // We assume that R_LEN == x and x fits in a uint32, where x is the variable // used in the pure Go decode_other.go code. // length = int(x) + 1 // // Unlike the pure Go code, we don't need to check if length <= 0 because - // CX can hold 64 bits, so the increment cannot overflow. - INCQ CX + // R_LEN can hold 64 bits, so the increment cannot overflow. + INCQ R_LEN // Prepare to check if copying length bytes will run past the end of dst or // src. // - // AX = len(dst) - d - // BX = len(src) - s - MOVQ R10, AX - SUBQ DI, AX - MOVQ R13, BX - SUBQ SI, BX + // R_TMP0 = len(dst) - d + // R_TMP1 = len(src) - s + MOVQ R_DEND, R_TMP0 + SUBQ R_DST, R_TMP0 + MOVQ R_SEND, R_TMP1 + SUBQ R_SRC, R_TMP1 // !!! Try a faster technique for short (16 or fewer bytes) copies. // @@ -109,11 +126,11 @@ doLit: // is contiguous in memory and so it needs to leave enough source bytes to // read the next tag without refilling buffers, but Go's Decode assumes // contiguousness (the src argument is a []byte). 
- CMPQ CX, $16 + CMPQ R_LEN, $16 JGT callMemmove - CMPQ AX, $16 + CMPQ R_TMP0, $16 JLT callMemmove - CMPQ BX, $16 + CMPQ R_TMP1, $16 JLT callMemmove // !!! Implement the copy from src to dst as a 16-byte load and store. @@ -127,53 +144,55 @@ doLit: // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or // 16-byte loads and stores. This technique probably wouldn't be as // effective on architectures that are fussier about alignment. - MOVOU 0(SI), X0 - MOVOU X0, 0(DI) + MOVOU 0(R_SRC), X0 + MOVOU X0, 0(R_DST) // d += length // s += length - ADDQ CX, DI - ADDQ CX, SI + ADDQ R_LEN, R_DST + ADDQ R_LEN, R_SRC JMP loop callMemmove: // if length > len(dst)-d || length > len(src)-s { etc } - CMPQ CX, AX + CMPQ R_LEN, R_TMP0 JGT errCorrupt - CMPQ CX, BX + CMPQ R_LEN, R_TMP1 JGT errCorrupt // copy(dst[d:], src[s:s+length]) // // This means calling runtime·memmove(&dst[d], &src[s], length), so we push - // DI, SI and CX as arguments. Coincidentally, we also need to spill those + // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those // three registers to the stack, to save local variables across the CALL. - MOVQ DI, 0(SP) - MOVQ SI, 8(SP) - MOVQ CX, 16(SP) - MOVQ DI, 24(SP) - MOVQ SI, 32(SP) - MOVQ CX, 40(SP) + MOVQ R_DST, 0(SP) + MOVQ R_SRC, 8(SP) + MOVQ R_LEN, 16(SP) + MOVQ R_DST, 24(SP) + MOVQ R_SRC, 32(SP) + MOVQ R_LEN, 40(SP) + MOVQ R_OFF, 48(SP) CALL runtime·memmove(SB) // Restore local variables: unspill registers from the stack and - // re-calculate R8-R13. - MOVQ 24(SP), DI - MOVQ 32(SP), SI - MOVQ 40(SP), CX - MOVQ dst_base+0(FP), R8 - MOVQ dst_len+8(FP), R9 - MOVQ R8, R10 - ADDQ R9, R10 - MOVQ src_base+24(FP), R11 - MOVQ src_len+32(FP), R12 - MOVQ R11, R13 - ADDQ R12, R13 + // re-calculate R_DBASE-R_SEND. + MOVQ 24(SP), R_DST + MOVQ 32(SP), R_SRC + MOVQ 40(SP), R_LEN + MOVQ 48(SP), R_OFF + MOVQ dst_base+0(FP), R_DBASE + MOVQ dst_len+8(FP), R_DLEN + MOVQ R_DBASE, R_DEND + ADDQ R_DLEN, R_DEND + MOVQ src_base+24(FP), R_SBASE + MOVQ src_len+32(FP), R_SLEN + MOVQ R_SBASE, R_SEND + ADDQ R_SLEN, R_SEND // d += length // s += length - ADDQ CX, DI - ADDQ CX, SI + ADDQ R_LEN, R_DST + ADDQ R_LEN, R_SRC JMP loop tagLit60Plus: @@ -182,44 +201,42 @@ tagLit60Plus: // s += x - 58; if uint(s) > uint(len(src)) { etc } // // checks. In the asm version, we code it once instead of once per switch case. - ADDQ CX, SI - SUBQ $58, SI - MOVQ SI, BX - SUBQ R11, BX - CMPQ BX, R12 + ADDQ R_LEN, R_SRC + SUBQ $58, R_SRC + CMPQ R_SRC, R_SEND JA errCorrupt // case x == 60: - CMPL CX, $61 + CMPL R_LEN, $61 JEQ tagLit61 JA tagLit62Plus // x = uint32(src[s-1]) - MOVBLZX -1(SI), CX + MOVBLZX -1(R_SRC), R_LEN JMP doLit tagLit61: // case x == 61: // x = uint32(src[s-2]) | uint32(src[s-1])<<8 - MOVWLZX -2(SI), CX + MOVWLZX -2(R_SRC), R_LEN JMP doLit tagLit62Plus: - CMPL CX, $62 + CMPL R_LEN, $62 JA tagLit63 // case x == 62: // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 - MOVWLZX -3(SI), CX - MOVBLZX -1(SI), BX - SHLL $16, BX - ORL BX, CX - JMP doLit + // We read one byte, safe to read one back, since we are just reading tag. + // x = binary.LittleEndian.Uint32(src[s-1:]) >> 8 + MOVL -4(R_SRC), R_LEN + SHRL $8, R_LEN + JMP doLit tagLit63: // case x == 63: // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 - MOVL -4(SI), CX + MOVL -4(R_SRC), R_LEN JMP doLit // The code above handles literal tags. 
@@ -229,103 +246,161 @@ tagLit63: tagCopy4: // case tagCopy4: // s += 5 - ADDQ $5, SI + ADDQ $5, R_SRC // if uint(s) > uint(len(src)) { etc } - MOVQ SI, BX - SUBQ R11, BX - CMPQ BX, R12 + CMPQ R_SRC, R_SEND JA errCorrupt // length = 1 + int(src[s-5])>>2 - SHRQ $2, CX - INCQ CX + SHRQ $2, R_LEN + INCQ R_LEN // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) - MOVLQZX -4(SI), DX + MOVLQZX -4(R_SRC), R_OFF JMP doCopy tagCopy2: // case tagCopy2: // s += 3 - ADDQ $3, SI + ADDQ $3, R_SRC // if uint(s) > uint(len(src)) { etc } - MOVQ SI, BX - SUBQ R11, BX - CMPQ BX, R12 + CMPQ R_SRC, R_SEND JA errCorrupt // length = 1 + int(src[s-3])>>2 - SHRQ $2, CX - INCQ CX + SHRQ $2, R_LEN + INCQ R_LEN // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) - MOVWQZX -2(SI), DX + MOVWQZX -2(R_SRC), R_OFF JMP doCopy tagCopy: // We have a copy tag. We assume that: - // - BX == src[s] & 0x03 - // - CX == src[s] - CMPQ BX, $2 + // - R_TMP1 == src[s] & 0x03 + // - R_LEN == src[s] + CMPQ R_TMP1, $2 JEQ tagCopy2 JA tagCopy4 // case tagCopy1: // s += 2 - ADDQ $2, SI + ADDQ $2, R_SRC // if uint(s) > uint(len(src)) { etc } - MOVQ SI, BX - SUBQ R11, BX - CMPQ BX, R12 + CMPQ R_SRC, R_SEND JA errCorrupt // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) - MOVQ CX, DX - ANDQ $0xe0, DX - SHLQ $3, DX - MOVBQZX -1(SI), BX - ORQ BX, DX - // length = 4 + int(src[s-2])>>2&0x7 - SHRQ $2, CX - ANDQ $7, CX - ADDQ $4, CX + MOVBQZX -1(R_SRC), R_TMP1 + MOVQ R_LEN, R_TMP0 + SHRQ $2, R_LEN + ANDQ $0xe0, R_TMP0 + ANDQ $7, R_LEN + SHLQ $3, R_TMP0 + ADDQ $4, R_LEN + ORQ R_TMP1, R_TMP0 + + // check if repeat code, ZF set by ORQ. + JZ repeatCode + + // This is a regular copy, transfer our temporary value to R_OFF (length) + MOVQ R_TMP0, R_OFF + JMP doCopy + +// This is a repeat code. +repeatCode: + // If length < 9, reuse last offset, with the length already calculated. + CMPQ R_LEN, $9 + JL doCopyRepeat + + // Read additional bytes for length. + JE repeatLen1 + + // Rare, so the extra branch shouldn't hurt too much. + CMPQ R_LEN, $10 + JE repeatLen2 + JMP repeatLen3 + +// Read repeat lengths. +repeatLen1: + // s ++ + ADDQ $1, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = src[s-1] + 8 + MOVBQZX -1(R_SRC), R_LEN + ADDL $8, R_LEN + JMP doCopyRepeat + +repeatLen2: + // s +=2 + ADDQ $2, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8) + MOVWQZX -2(R_SRC), R_LEN + ADDL $260, R_LEN + JMP doCopyRepeat + +repeatLen3: + // s +=3 + ADDQ $3, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16) + // Read one byte further back (just part of the tag, shifted out) + MOVL -4(R_SRC), R_LEN + SHRL $8, R_LEN + ADDL $65540, R_LEN + JMP doCopyRepeat doCopy: // This is the end of the outer "switch", when we have a copy tag. // // We assume that: - // - CX == length && CX > 0 - // - DX == offset - - // if offset <= 0 { etc } - CMPQ DX, $0 - JLE errCorrupt + // - R_LEN == length && R_LEN > 0 + // - R_OFF == offset // if d < offset { etc } - MOVQ DI, BX - SUBQ R8, BX - CMPQ BX, DX + MOVQ R_DST, R_TMP1 + SUBQ R_DBASE, R_TMP1 + CMPQ R_TMP1, R_OFF JLT errCorrupt + // Repeat values can skip the test above, since any offset > 0 will be in dst. 
+doCopyRepeat: + // if offset <= 0 { etc } + CMPQ R_OFF, $0 + JLE errCorrupt + // if length > len(dst)-d { etc } - MOVQ R10, BX - SUBQ DI, BX - CMPQ CX, BX + MOVQ R_DEND, R_TMP1 + SUBQ R_DST, R_TMP1 + CMPQ R_LEN, R_TMP1 JGT errCorrupt // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length // // Set: - // - R14 = len(dst)-d - // - R15 = &dst[d-offset] - MOVQ R10, R14 - SUBQ DI, R14 - MOVQ DI, R15 - SUBQ DX, R15 + // - R_TMP2 = len(dst)-d + // - R_TMP3 = &dst[d-offset] + MOVQ R_DEND, R_TMP2 + SUBQ R_DST, R_TMP2 + MOVQ R_DST, R_TMP3 + SUBQ R_OFF, R_TMP3 // !!! Try a faster technique for short (16 or fewer bytes) forward copies. // @@ -340,17 +415,17 @@ doCopy: // } // copy 16 bytes // d += length - CMPQ CX, $16 + CMPQ R_LEN, $16 JGT slowForwardCopy - CMPQ DX, $8 + CMPQ R_OFF, $8 JLT slowForwardCopy - CMPQ R14, $16 + CMPQ R_TMP2, $16 JLT slowForwardCopy - MOVQ 0(R15), AX - MOVQ AX, 0(DI) - MOVQ 8(R15), BX - MOVQ BX, 8(DI) - ADDQ CX, DI + MOVQ 0(R_TMP3), R_TMP0 + MOVQ R_TMP0, 0(R_DST) + MOVQ 8(R_TMP3), R_TMP1 + MOVQ R_TMP1, 8(R_DST) + ADDQ R_LEN, R_DST JMP loop slowForwardCopy: @@ -402,10 +477,13 @@ slowForwardCopy: // if length > len(dst)-d-10 { // goto verySlowForwardCopy // } - SUBQ $10, R14 - CMPQ CX, R14 + SUBQ $10, R_TMP2 + CMPQ R_LEN, R_TMP2 JGT verySlowForwardCopy + // We want to keep the offset, so we use R_TMP2 from here. + MOVQ R_OFF, R_TMP2 + makeOffsetAtLeast8: // !!! As above, expand the pattern so that offset >= 8 and we can use // 8-byte load/stores. @@ -416,37 +494,37 @@ makeOffsetAtLeast8: // d += offset // offset += offset // // The two previous lines together means that d-offset, and therefore - // // R15, is unchanged. + // // R_TMP3, is unchanged. // } - CMPQ DX, $8 + CMPQ R_TMP2, $8 JGE fixUpSlowForwardCopy - MOVQ (R15), BX - MOVQ BX, (DI) - SUBQ DX, CX - ADDQ DX, DI - ADDQ DX, DX + MOVQ (R_TMP3), R_TMP1 + MOVQ R_TMP1, (R_DST) + SUBQ R_TMP2, R_LEN + ADDQ R_TMP2, R_DST + ADDQ R_TMP2, R_TMP2 JMP makeOffsetAtLeast8 fixUpSlowForwardCopy: - // !!! Add length (which might be negative now) to d (implied by DI being + // !!! Add length (which might be negative now) to d (implied by R_DST being // &dst[d]) so that d ends up at the right place when we jump back to the - // top of the loop. Before we do that, though, we save DI to AX so that, if + // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if // length is positive, copying the remaining length bytes will write to the // right place. - MOVQ DI, AX - ADDQ CX, DI + MOVQ R_DST, R_TMP0 + ADDQ R_LEN, R_DST finishSlowForwardCopy: // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative // length means that we overrun, but as above, that will be fixed up by // subsequent iterations of the outermost loop. - CMPQ CX, $0 + CMPQ R_LEN, $0 JLE loop - MOVQ (R15), BX - MOVQ BX, (AX) - ADDQ $8, R15 - ADDQ $8, AX - SUBQ $8, CX + MOVQ (R_TMP3), R_TMP1 + MOVQ R_TMP1, (R_TMP0) + ADDQ $8, R_TMP3 + ADDQ $8, R_TMP0 + SUBQ $8, R_LEN JMP finishSlowForwardCopy verySlowForwardCopy: @@ -462,11 +540,11 @@ verySlowForwardCopy: // break // } // } - MOVB (R15), BX - MOVB BX, (DI) - INCQ R15 - INCQ DI - DECQ CX + MOVB (R_TMP3), R_TMP1 + MOVB R_TMP1, (R_DST) + INCQ R_TMP3 + INCQ R_DST + DECQ R_LEN JNZ verySlowForwardCopy JMP loop @@ -477,7 +555,7 @@ end: // This is the end of the "for s < len(src)". 
// // if d != len(dst) { etc } - CMPQ DI, R10 + CMPQ R_DST, R_DEND JNE errCorrupt // return 0 diff --git a/vendor/github.com/golang/snappy/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s similarity index 59% rename from vendor/github.com/golang/snappy/decode_arm64.s rename to vendor/github.com/klauspost/compress/s2/decode_arm64.s index 7a3ead17ea..78e463f342 100644 --- a/vendor/github.com/golang/snappy/decode_arm64.s +++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s @@ -8,6 +8,31 @@ #include "textflag.h" +#define R_TMP0 R2 +#define R_TMP1 R3 +#define R_LEN R4 +#define R_OFF R5 +#define R_SRC R6 +#define R_DST R7 +#define R_DBASE R8 +#define R_DLEN R9 +#define R_DEND R10 +#define R_SBASE R11 +#define R_SLEN R12 +#define R_SEND R13 +#define R_TMP2 R14 +#define R_TMP3 R15 + +// TEST_SRC will check if R_SRC is <= SRC_END +#define TEST_SRC() \ + CMP R_SEND, R_SRC \ + BGT errCorrupt + +// MOVD R_SRC, R_TMP1 +// SUB R_SBASE, R_TMP1, R_TMP1 +// CMP R_SLEN, R_TMP1 +// BGT errCorrupt + // The asm code generally follows the pure Go code in decode_other.go, except // where marked with a "!!!". @@ -15,52 +40,53 @@ // // All local variables fit into registers. The non-zero stack size is only to // spill registers and push args when issuing a CALL. The register allocation: -// - R2 scratch -// - R3 scratch -// - R4 length or x -// - R5 offset -// - R6 &src[s] -// - R7 &dst[d] -// + R8 dst_base -// + R9 dst_len -// + R10 dst_base + dst_len -// + R11 src_base -// + R12 src_len -// + R13 src_base + src_len -// - R14 used by doCopy -// - R15 used by doCopy +// - R_TMP0 scratch +// - R_TMP1 scratch +// - R_LEN length or x +// - R_OFF offset +// - R_SRC &src[s] +// - R_DST &dst[d] +// + R_DBASE dst_base +// + R_DLEN dst_len +// + R_DEND dst_base + dst_len +// + R_SBASE src_base +// + R_SLEN src_len +// + R_SEND src_base + src_len +// - R_TMP2 used by doCopy +// - R_TMP3 used by doCopy // -// The registers R8-R13 (marked with a "+") are set at the start of the +// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the // function, and after a CALL returns, and are not otherwise modified. // -// The d variable is implicitly R7 - R8, and len(dst)-d is R10 - R7. -// The s variable is implicitly R6 - R11, and len(src)-s is R13 - R6. -TEXT ·decode(SB), NOSPLIT, $56-56 - // Initialize R6, R7 and R8-R13. - MOVD dst_base+0(FP), R8 - MOVD dst_len+8(FP), R9 - MOVD R8, R7 - MOVD R8, R10 - ADD R9, R10, R10 - MOVD src_base+24(FP), R11 - MOVD src_len+32(FP), R12 - MOVD R11, R6 - MOVD R11, R13 - ADD R12, R13, R13 +// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. +// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. +TEXT ·s2Decode(SB), NOSPLIT, $56-56 + // Initialize R_SRC, R_DST and R_DBASE-R_SEND. 
+ MOVD dst_base+0(FP), R_DBASE + MOVD dst_len+8(FP), R_DLEN + MOVD R_DBASE, R_DST + MOVD R_DBASE, R_DEND + ADD R_DLEN, R_DEND, R_DEND + MOVD src_base+24(FP), R_SBASE + MOVD src_len+32(FP), R_SLEN + MOVD R_SBASE, R_SRC + MOVD R_SBASE, R_SEND + ADD R_SLEN, R_SEND, R_SEND + MOVD $0, R_OFF loop: // for s < len(src) - CMP R13, R6 + CMP R_SEND, R_SRC BEQ end - // R4 = uint32(src[s]) + // R_LEN = uint32(src[s]) // // switch src[s] & 0x03 - MOVBU (R6), R4 - MOVW R4, R3 - ANDW $3, R3 + MOVBU (R_SRC), R_LEN + MOVW R_LEN, R_TMP1 + ANDW $3, R_TMP1 MOVW $1, R1 - CMPW R1, R3 + CMPW R1, R_TMP1 BGE tagCopy // ---------------------------------------- @@ -70,35 +96,35 @@ loop: // x := uint32(src[s] >> 2) // switch MOVW $60, R1 - LSRW $2, R4, R4 - CMPW R4, R1 + LSRW $2, R_LEN, R_LEN + CMPW R_LEN, R1 BLS tagLit60Plus // case x < 60: // s++ - ADD $1, R6, R6 + ADD $1, R_SRC, R_SRC doLit: // This is the end of the inner "switch", when we have a literal tag. // - // We assume that R4 == x and x fits in a uint32, where x is the variable + // We assume that R_LEN == x and x fits in a uint32, where x is the variable // used in the pure Go decode_other.go code. // length = int(x) + 1 // // Unlike the pure Go code, we don't need to check if length <= 0 because - // R4 can hold 64 bits, so the increment cannot overflow. - ADD $1, R4, R4 + // R_LEN can hold 64 bits, so the increment cannot overflow. + ADD $1, R_LEN, R_LEN // Prepare to check if copying length bytes will run past the end of dst or // src. // - // R2 = len(dst) - d - // R3 = len(src) - s - MOVD R10, R2 - SUB R7, R2, R2 - MOVD R13, R3 - SUB R6, R3, R3 + // R_TMP0 = len(dst) - d + // R_TMP1 = len(src) - s + MOVD R_DEND, R_TMP0 + SUB R_DST, R_TMP0, R_TMP0 + MOVD R_SEND, R_TMP1 + SUB R_SRC, R_TMP1, R_TMP1 // !!! Try a faster technique for short (16 or fewer bytes) copies. // @@ -111,11 +137,11 @@ doLit: // is contiguous in memory and so it needs to leave enough source bytes to // read the next tag without refilling buffers, but Go's Decode assumes // contiguousness (the src argument is a []byte). - CMP $16, R4 + CMP $16, R_LEN BGT callMemmove - CMP $16, R2 + CMP $16, R_TMP0 BLT callMemmove - CMP $16, R3 + CMP $16, R_TMP1 BLT callMemmove // !!! Implement the copy from src to dst as a 16-byte load and store. @@ -129,53 +155,55 @@ doLit: // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or // 16-byte loads and stores. This technique probably wouldn't be as // effective on architectures that are fussier about alignment. - LDP 0(R6), (R14, R15) - STP (R14, R15), 0(R7) + LDP 0(R_SRC), (R_TMP2, R_TMP3) + STP (R_TMP2, R_TMP3), 0(R_DST) // d += length // s += length - ADD R4, R7, R7 - ADD R4, R6, R6 + ADD R_LEN, R_DST, R_DST + ADD R_LEN, R_SRC, R_SRC B loop callMemmove: // if length > len(dst)-d || length > len(src)-s { etc } - CMP R2, R4 + CMP R_TMP0, R_LEN BGT errCorrupt - CMP R3, R4 + CMP R_TMP1, R_LEN BGT errCorrupt // copy(dst[d:], src[s:s+length]) // // This means calling runtime·memmove(&dst[d], &src[s], length), so we push - // R7, R6 and R4 as arguments. Coincidentally, we also need to spill those + // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those // three registers to the stack, to save local variables across the CALL. 
- MOVD R7, 8(RSP) - MOVD R6, 16(RSP) - MOVD R4, 24(RSP) - MOVD R7, 32(RSP) - MOVD R6, 40(RSP) - MOVD R4, 48(RSP) + MOVD R_DST, 8(RSP) + MOVD R_SRC, 16(RSP) + MOVD R_LEN, 24(RSP) + MOVD R_DST, 32(RSP) + MOVD R_SRC, 40(RSP) + MOVD R_LEN, 48(RSP) + MOVD R_OFF, 56(RSP) CALL runtime·memmove(SB) // Restore local variables: unspill registers from the stack and - // re-calculate R8-R13. - MOVD 32(RSP), R7 - MOVD 40(RSP), R6 - MOVD 48(RSP), R4 - MOVD dst_base+0(FP), R8 - MOVD dst_len+8(FP), R9 - MOVD R8, R10 - ADD R9, R10, R10 - MOVD src_base+24(FP), R11 - MOVD src_len+32(FP), R12 - MOVD R11, R13 - ADD R12, R13, R13 + // re-calculate R_DBASE-R_SEND. + MOVD 32(RSP), R_DST + MOVD 40(RSP), R_SRC + MOVD 48(RSP), R_LEN + MOVD 56(RSP), R_OFF + MOVD dst_base+0(FP), R_DBASE + MOVD dst_len+8(FP), R_DLEN + MOVD R_DBASE, R_DEND + ADD R_DLEN, R_DEND, R_DEND + MOVD src_base+24(FP), R_SBASE + MOVD src_len+32(FP), R_SLEN + MOVD R_SBASE, R_SEND + ADD R_SLEN, R_SEND, R_SEND // d += length // s += length - ADD R4, R7, R7 - ADD R4, R6, R6 + ADD R_LEN, R_DST, R_DST + ADD R_LEN, R_SRC, R_SRC B loop tagLit60Plus: @@ -184,44 +212,41 @@ tagLit60Plus: // s += x - 58; if uint(s) > uint(len(src)) { etc } // // checks. In the asm version, we code it once instead of once per switch case. - ADD R4, R6, R6 - SUB $58, R6, R6 - MOVD R6, R3 - SUB R11, R3, R3 - CMP R12, R3 - BGT errCorrupt + ADD R_LEN, R_SRC, R_SRC + SUB $58, R_SRC, R_SRC + TEST_SRC() // case x == 60: MOVW $61, R1 - CMPW R1, R4 + CMPW R1, R_LEN BEQ tagLit61 BGT tagLit62Plus // x = uint32(src[s-1]) - MOVBU -1(R6), R4 + MOVBU -1(R_SRC), R_LEN B doLit tagLit61: // case x == 61: // x = uint32(src[s-2]) | uint32(src[s-1])<<8 - MOVHU -2(R6), R4 + MOVHU -2(R_SRC), R_LEN B doLit tagLit62Plus: - CMPW $62, R4 + CMPW $62, R_LEN BHI tagLit63 // case x == 62: // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 - MOVHU -3(R6), R4 - MOVBU -1(R6), R3 - ORR R3<<16, R4 + MOVHU -3(R_SRC), R_LEN + MOVBU -1(R_SRC), R_TMP1 + ORR R_TMP1<<16, R_LEN B doLit tagLit63: // case x == 63: // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 - MOVWU -4(R6), R4 + MOVWU -4(R_SRC), R_LEN B doLit // The code above handles literal tags. @@ -231,103 +256,155 @@ tagLit63: tagCopy4: // case tagCopy4: // s += 5 - ADD $5, R6, R6 + ADD $5, R_SRC, R_SRC // if uint(s) > uint(len(src)) { etc } - MOVD R6, R3 - SUB R11, R3, R3 - CMP R12, R3 + MOVD R_SRC, R_TMP1 + SUB R_SBASE, R_TMP1, R_TMP1 + CMP R_SLEN, R_TMP1 BGT errCorrupt // length = 1 + int(src[s-5])>>2 MOVD $1, R1 - ADD R4>>2, R1, R4 + ADD R_LEN>>2, R1, R_LEN // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) - MOVWU -4(R6), R5 + MOVWU -4(R_SRC), R_OFF B doCopy tagCopy2: // case tagCopy2: // s += 3 - ADD $3, R6, R6 + ADD $3, R_SRC, R_SRC // if uint(s) > uint(len(src)) { etc } - MOVD R6, R3 - SUB R11, R3, R3 - CMP R12, R3 - BGT errCorrupt + TEST_SRC() // length = 1 + int(src[s-3])>>2 MOVD $1, R1 - ADD R4>>2, R1, R4 + ADD R_LEN>>2, R1, R_LEN // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) - MOVHU -2(R6), R5 + MOVHU -2(R_SRC), R_OFF B doCopy tagCopy: // We have a copy tag. 
We assume that: - // - R3 == src[s] & 0x03 - // - R4 == src[s] - CMP $2, R3 + // - R_TMP1 == src[s] & 0x03 + // - R_LEN == src[s] + CMP $2, R_TMP1 BEQ tagCopy2 BGT tagCopy4 // case tagCopy1: // s += 2 - ADD $2, R6, R6 + ADD $2, R_SRC, R_SRC // if uint(s) > uint(len(src)) { etc } - MOVD R6, R3 - SUB R11, R3, R3 - CMP R12, R3 - BGT errCorrupt + TEST_SRC() // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) - MOVD R4, R5 - AND $0xe0, R5 - MOVBU -1(R6), R3 - ORR R5<<3, R3, R5 + // Calculate offset in R_TMP0 in case it is a repeat. + MOVD R_LEN, R_TMP0 + AND $0xe0, R_TMP0 + MOVBU -1(R_SRC), R_TMP1 + ORR R_TMP0<<3, R_TMP1, R_TMP0 // length = 4 + int(src[s-2])>>2&0x7 MOVD $7, R1 - AND R4>>2, R1, R4 - ADD $4, R4, R4 + AND R_LEN>>2, R1, R_LEN + ADD $4, R_LEN, R_LEN + + // check if repeat code with offset 0. + CMP $0, R_TMP0 + BEQ repeatCode + + // This is a regular copy, transfer our temporary value to R_OFF (offset) + MOVD R_TMP0, R_OFF + B doCopy + + // This is a repeat code. +repeatCode: + // If length < 9, reuse last offset, with the length already calculated. + CMP $9, R_LEN + BLT doCopyRepeat + BEQ repeatLen1 + CMP $10, R_LEN + BEQ repeatLen2 + +repeatLen3: + // s +=3 + ADD $3, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540 + MOVBU -1(R_SRC), R_TMP0 + MOVHU -3(R_SRC), R_LEN + ORR R_TMP0<<16, R_LEN, R_LEN + ADD $65540, R_LEN, R_LEN + B doCopyRepeat + +repeatLen2: + // s +=2 + ADD $2, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260 + MOVHU -2(R_SRC), R_LEN + ADD $260, R_LEN, R_LEN + B doCopyRepeat + +repeatLen1: + // s +=1 + ADD $1, R_SRC, R_SRC + + // if uint(s) > uint(len(src)) { etc } + TEST_SRC() + + // length = src[s-1] + 8 + MOVBU -1(R_SRC), R_LEN + ADD $8, R_LEN, R_LEN + B doCopyRepeat doCopy: // This is the end of the outer "switch", when we have a copy tag. // // We assume that: - // - R4 == length && R4 > 0 - // - R5 == offset - - // if offset <= 0 { etc } - MOVD $0, R1 - CMP R1, R5 - BLE errCorrupt + // - R_LEN == length && R_LEN > 0 + // - R_OFF == offset // if d < offset { etc } - MOVD R7, R3 - SUB R8, R3, R3 - CMP R5, R3 + MOVD R_DST, R_TMP1 + SUB R_DBASE, R_TMP1, R_TMP1 + CMP R_OFF, R_TMP1 BLT errCorrupt + // Repeat values can skip the test above, since any offset > 0 will be in dst. +doCopyRepeat: + + // if offset <= 0 { etc } + CMP $0, R_OFF + BLE errCorrupt + // if length > len(dst)-d { etc } - MOVD R10, R3 - SUB R7, R3, R3 - CMP R3, R4 + MOVD R_DEND, R_TMP1 + SUB R_DST, R_TMP1, R_TMP1 + CMP R_TMP1, R_LEN BGT errCorrupt // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length // // Set: - // - R14 = len(dst)-d - // - R15 = &dst[d-offset] - MOVD R10, R14 - SUB R7, R14, R14 - MOVD R7, R15 - SUB R5, R15, R15 + // - R_TMP2 = len(dst)-d + // - R_TMP3 = &dst[d-offset] + MOVD R_DEND, R_TMP2 + SUB R_DST, R_TMP2, R_TMP2 + MOVD R_DST, R_TMP3 + SUB R_OFF, R_TMP3, R_TMP3 // !!! Try a faster technique for short (16 or fewer bytes) forward copies. 
// @@ -342,17 +419,17 @@ doCopy: // } // copy 16 bytes // d += length - CMP $16, R4 + CMP $16, R_LEN BGT slowForwardCopy - CMP $8, R5 + CMP $8, R_OFF BLT slowForwardCopy - CMP $16, R14 + CMP $16, R_TMP2 BLT slowForwardCopy - MOVD 0(R15), R2 - MOVD R2, 0(R7) - MOVD 8(R15), R3 - MOVD R3, 8(R7) - ADD R4, R7, R7 + MOVD 0(R_TMP3), R_TMP0 + MOVD R_TMP0, 0(R_DST) + MOVD 8(R_TMP3), R_TMP1 + MOVD R_TMP1, 8(R_DST) + ADD R_LEN, R_DST, R_DST B loop slowForwardCopy: @@ -404,10 +481,13 @@ slowForwardCopy: // if length > len(dst)-d-10 { // goto verySlowForwardCopy // } - SUB $10, R14, R14 - CMP R14, R4 + SUB $10, R_TMP2, R_TMP2 + CMP R_TMP2, R_LEN BGT verySlowForwardCopy + // We want to keep the offset, so we use R_TMP2 from here. + MOVD R_OFF, R_TMP2 + makeOffsetAtLeast8: // !!! As above, expand the pattern so that offset >= 8 and we can use // 8-byte load/stores. @@ -418,38 +498,38 @@ makeOffsetAtLeast8: // d += offset // offset += offset // // The two previous lines together means that d-offset, and therefore - // // R15, is unchanged. + // // R_TMP3, is unchanged. // } - CMP $8, R5 + CMP $8, R_TMP2 BGE fixUpSlowForwardCopy - MOVD (R15), R3 - MOVD R3, (R7) - SUB R5, R4, R4 - ADD R5, R7, R7 - ADD R5, R5, R5 + MOVD (R_TMP3), R_TMP1 + MOVD R_TMP1, (R_DST) + SUB R_TMP2, R_LEN, R_LEN + ADD R_TMP2, R_DST, R_DST + ADD R_TMP2, R_TMP2, R_TMP2 B makeOffsetAtLeast8 fixUpSlowForwardCopy: - // !!! Add length (which might be negative now) to d (implied by R7 being + // !!! Add length (which might be negative now) to d (implied by R_DST being // &dst[d]) so that d ends up at the right place when we jump back to the - // top of the loop. Before we do that, though, we save R7 to R2 so that, if + // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if // length is positive, copying the remaining length bytes will write to the // right place. - MOVD R7, R2 - ADD R4, R7, R7 + MOVD R_DST, R_TMP0 + ADD R_LEN, R_DST, R_DST finishSlowForwardCopy: // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative // length means that we overrun, but as above, that will be fixed up by // subsequent iterations of the outermost loop. MOVD $0, R1 - CMP R1, R4 + CMP R1, R_LEN BLE loop - MOVD (R15), R3 - MOVD R3, (R2) - ADD $8, R15, R15 - ADD $8, R2, R2 - SUB $8, R4, R4 + MOVD (R_TMP3), R_TMP1 + MOVD R_TMP1, (R_TMP0) + ADD $8, R_TMP3, R_TMP3 + ADD $8, R_TMP0, R_TMP0 + SUB $8, R_LEN, R_LEN B finishSlowForwardCopy verySlowForwardCopy: @@ -465,12 +545,12 @@ verySlowForwardCopy: // break // } // } - MOVB (R15), R3 - MOVB R3, (R7) - ADD $1, R15, R15 - ADD $1, R7, R7 - SUB $1, R4, R4 - CBNZ R4, verySlowForwardCopy + MOVB (R_TMP3), R_TMP1 + MOVB R_TMP1, (R_DST) + ADD $1, R_TMP3, R_TMP3 + ADD $1, R_DST, R_DST + SUB $1, R_LEN, R_LEN + CBNZ R_LEN, verySlowForwardCopy B loop // The code above handles copy tags. @@ -480,7 +560,7 @@ end: // This is the end of the "for s < len(src)". 
// // if d != len(dst) { etc } - CMP R10, R7 + CMP R_DEND, R_DST BNE errCorrupt // return 0 @@ -489,6 +569,6 @@ end: errCorrupt: // return decodeErrCodeCorrupt - MOVD $1, R2 - MOVD R2, ret+48(FP) + MOVD $1, R_TMP0 + MOVD R_TMP0, ret+48(FP) RET diff --git a/vendor/github.com/golang/snappy/decode_asm.go b/vendor/github.com/klauspost/compress/s2/decode_asm.go similarity index 66% rename from vendor/github.com/golang/snappy/decode_asm.go rename to vendor/github.com/klauspost/compress/s2/decode_asm.go index 7082b34919..cb3576edd4 100644 --- a/vendor/github.com/golang/snappy/decode_asm.go +++ b/vendor/github.com/klauspost/compress/s2/decode_asm.go @@ -1,15 +1,17 @@ // Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build (amd64 || arm64) && !appengine && gc && !noasm +// +build amd64 arm64 // +build !appengine // +build gc // +build !noasm -// +build amd64 arm64 -package snappy +package s2 // decode has the same semantics as in decode_other.go. // //go:noescape -func decode(dst, src []byte) int +func s2Decode(dst, src []byte) int diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go new file mode 100644 index 0000000000..2cb55c2c77 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_other.go @@ -0,0 +1,292 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (!amd64 && !arm64) || appengine || !gc || noasm +// +build !amd64,!arm64 appengine !gc noasm + +package s2 + +import ( + "fmt" + "strconv" +) + +// decode writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. +// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. +func s2Decode(dst, src []byte) int { + const debug = false + if debug { + fmt.Println("Starting decode, dst len:", len(dst)) + } + var d, s, length int + offset := 0 + + // As long as we can read at least 5 bytes... + for s < len(src)-5 { + // Removing bounds checks is SLOWER, when if doing + // in := src[s:s+5] + // Checked on Go 1.18 + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + x = uint32(src[s-1]) + case x == 61: + in := src[s : s+3] + x = uint32(in[1]) | uint32(in[2])<<8 + s += 3 + case x == 62: + in := src[s : s+4] + // Load as 32 bit and shift down. 
+ x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + x >>= 8 + s += 4 + case x == 63: + in := src[s : s+5] + x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24 + s += 5 + } + length = int(x) + 1 + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + if debug { + fmt.Println("corrupt: lit size", length) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + length = int(src[s-2]) >> 2 & 0x7 + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + length = int(src[s]) + 4 + s += 1 + case 6: + in := src[s : s+2] + length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8) + s += 2 + case 7: + in := src[s : s+3] + length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16) + s += 3 + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + in := src[s : s+3] + offset = int(uint32(in[1]) | uint32(in[2])<<8) + length = 1 + int(in[0])>>2 + s += 3 + + case tagCopy4: + in := src[s : s+5] + offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24) + length = 1 + int(in[0])>>2 + s += 5 + } + + if offset <= 0 || d < offset || length > len(dst)-d { + if debug { + fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d) + } + + return decodeErrCodeCorrupt + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + // Remaining with extra checks... + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-1]) + case x == 61: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-2]) | uint32(src[s-1])<<8 + case x == 62: + s += 4 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + case x == 63: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. 
+ return decodeErrCodeCorrupt + } + x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + } + length = int(x) + 1 + if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { + if debug { + fmt.Println("corrupt: lit size", length) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Println("literals, length:", length, "d-after:", d+length) + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(src[s-2]) >> 2 & 0x7 + toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + if toffset == 0 { + if debug { + fmt.Print("(repeat) ") + } + // keep last offset + switch length { + case 5: + s += 1 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-1])) + 4 + case 6: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) + case 7: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) + default: // 0-> 4 + } + } else { + offset = toffset + } + length += 4 + case tagCopy2: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + + case tagCopy4: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-5])>>2 + offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + } + + if offset <= 0 || d < offset || length > len(dst)-d { + if debug { + fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + + if debug { + fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + if d != len(dst) { + return decodeErrCodeCorrupt + } + return 0 +} diff --git a/vendor/github.com/klauspost/compress/s2/dict.go b/vendor/github.com/klauspost/compress/s2/dict.go new file mode 100644 index 0000000000..f125ad0963 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/dict.go @@ -0,0 +1,350 @@ +// Copyright (c) 2022+ Klaus Post. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "bytes" + "encoding/binary" + "sync" +) + +const ( + // MinDictSize is the minimum dictionary size when repeat has been read. + MinDictSize = 16 + + // MaxDictSize is the maximum dictionary size when repeat has been read. + MaxDictSize = 65536 + + // MaxDictSrcOffset is the maximum offset where a dictionary entry can start. + MaxDictSrcOffset = 65535 +) + +// Dict contains a dictionary that can be used for encoding and decoding s2 +type Dict struct { + dict []byte + repeat int // Repeat as index of dict + + fast, better, best sync.Once + fastTable *[1 << 14]uint16 + + betterTableShort *[1 << 14]uint16 + betterTableLong *[1 << 17]uint16 + + bestTableShort *[1 << 16]uint32 + bestTableLong *[1 << 19]uint32 +} + +// NewDict will read a dictionary. +// It will return nil if the dictionary is invalid. +func NewDict(dict []byte) *Dict { + if len(dict) == 0 { + return nil + } + var d Dict + // Repeat is the first value of the dict + r, n := binary.Uvarint(dict) + if n <= 0 { + return nil + } + dict = dict[n:] + d.dict = dict + if cap(d.dict) < len(d.dict)+16 { + d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...) + } + if len(dict) < MinDictSize || len(dict) > MaxDictSize { + return nil + } + d.repeat = int(r) + if d.repeat > len(dict) { + return nil + } + return &d +} + +// Bytes will return a serialized version of the dictionary. +// The output can be sent to NewDict. +func (d *Dict) Bytes() []byte { + dst := make([]byte, binary.MaxVarintLen16+len(d.dict)) + return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...) +} + +// MakeDict will create a dictionary. +// 'data' must be at least MinDictSize. +// If data is longer than MaxDictSize only the last MaxDictSize bytes will be used. +// If searchStart is set the start repeat value will be set to the last +// match of this content. +// If no matches are found, it will attempt to find shorter matches. +// This content should match the typical start of a block. +// If at least 4 bytes cannot be matched, repeat is set to start of block. +func MakeDict(data []byte, searchStart []byte) *Dict { + if len(data) == 0 { + return nil + } + if len(data) > MaxDictSize { + data = data[len(data)-MaxDictSize:] + } + var d Dict + dict := data + d.dict = dict + if cap(d.dict) < len(d.dict)+16 { + d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...) + } + if len(dict) < MinDictSize { + return nil + } + + // Find the longest match possible, last entry if multiple. + for s := len(searchStart); s > 4; s-- { + if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 { + d.repeat = idx + break + } + } + + return &d +} + +// MakeDictManual will create a dictionary. +// 'data' must be at least MinDictSize and less than or equal to MaxDictSize. +// A manual first repeat index into data must be provided. +// It must be less than len(data)-8. +func MakeDictManual(data []byte, firstIdx uint16) *Dict { + if len(data) < MinDictSize || int(firstIdx) >= len(data)-8 || len(data) > MaxDictSize { + return nil + } + var d Dict + dict := data + d.dict = dict + if cap(d.dict) < len(d.dict)+16 { + d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...) + } + + d.repeat = int(firstIdx) + return &d +} + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. 
+// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func (d *Dict) Encode(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + dstP := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:dstP] + } + if len(src) < minNonLiteralBlockSize { + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] + } + n := encodeBlockDictGo(dst[dstP:], src, d) + if n > 0 { + dstP += n + return dst[:dstP] + } + // Not compressible + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] +} + +// EncodeBetter returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBetter compresses better than Encode but typically with a +// 10-40% speed decrease on both compression and decompression. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func (d *Dict) EncodeBetter(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + dstP := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:dstP] + } + if len(src) < minNonLiteralBlockSize { + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] + } + n := encodeBlockBetterDict(dst[dstP:], src, d) + if n > 0 { + dstP += n + return dst[:dstP] + } + // Not compressible + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] +} + +// EncodeBest returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBest compresses as good as reasonably possible but with a +// big speed decrease. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func (d *Dict) EncodeBest(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
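Encode, EncodeBetter and EncodeBest on Dict follow the same shape as the package-level encoders, differing only in the dictionary-aware block encoders they call, and (*Dict).Bytes / NewDict round-trip the dictionary itself. A hypothetical usage sketch of the API in this file — the payload, dictionary contents and helper name are illustrative only:

    package main

    import (
        "bytes"
        "errors"

        "github.com/klauspost/compress/s2"
    )

    // dictRoundTrip compresses and decompresses payload with a shared
    // dictionary. The same *Dict must be used on both sides.
    func dictRoundTrip(payload []byte) ([]byte, error) {
        dictData := bytes.Repeat([]byte("commonprefix "), 2000) // within Min/MaxDictSize
        d := s2.MakeDict(dictData, []byte("commonprefix"))
        if d == nil {
            return nil, errors.New("dictionary rejected")
        }
        enc := d.Encode(nil, payload)
        return d.Decode(nil, enc)
    }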
+ dstP := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:dstP] + } + if len(src) < minNonLiteralBlockSize { + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] + } + n := encodeBlockBest(dst[dstP:], src, d) + if n > 0 { + dstP += n + return dst[:dstP] + } + // Not compressible + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP] +} + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +func (d *Dict) Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if dLen <= cap(dst) { + dst = dst[:dLen] + } else { + dst = make([]byte, dLen) + } + if s2DecodeDict(dst, src[s:], d) != 0 { + return nil, ErrCorrupt + } + return dst, nil +} + +func (d *Dict) initFast() { + d.fast.Do(func() { + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint16 + // We stop so any entry of length 8 can always be read. + for i := 0; i < len(d.dict)-8-2; i += 3 { + x0 := load64(d.dict, i) + h0 := hash6(x0, tableBits) + h1 := hash6(x0>>8, tableBits) + h2 := hash6(x0>>16, tableBits) + table[h0] = uint16(i) + table[h1] = uint16(i + 1) + table[h2] = uint16(i + 2) + } + d.fastTable = &table + }) +} + +func (d *Dict) initBetter() { + d.better.Do(func() { + const ( + // Long hash matches. + lTableBits = 17 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint16 + var sTable [maxSTableSize]uint16 + + // We stop so any entry of length 8 can always be read. + for i := 0; i < len(d.dict)-8; i++ { + cv := load64(d.dict, i) + lTable[hash7(cv, lTableBits)] = uint16(i) + sTable[hash4(cv, sTableBits)] = uint16(i) + } + d.betterTableShort = &sTable + d.betterTableLong = &lTable + }) +} + +func (d *Dict) initBest() { + d.best.Do(func() { + const ( + // Long hash matches. + lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // We stop so any entry of length 8 can always be read. + for i := 0; i < len(d.dict)-8; i++ { + cv := load64(d.dict, i) + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + lTable[hashL] = uint32(i) | candidateL<<16 + sTable[hashS] = uint32(i) | candidateS<<16 + } + d.bestTableShort = &sTable + d.bestTableLong = &lTable + }) +} diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go new file mode 100644 index 0000000000..20b802270a --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode.go @@ -0,0 +1,414 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "encoding/binary" + "math" + "math/bits" + "sync" + + "github.com/klauspost/compress/internal/race" +) + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. 
+// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func Encode(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlock(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +var estblockPool [2]sync.Pool + +// EstimateBlockSize will perform a very fast compression +// without outputting the result and return the compressed output size. +// The function returns -1 if no improvement could be achieved. +// Using actual compression will most often produce better compression than the estimate. +func EstimateBlockSize(src []byte) (d int) { + if len(src) <= inputMargin || int64(len(src)) > 0xffffffff { + return -1 + } + if len(src) <= 1024 { + const sz, pool = 2048, 0 + tmp, ok := estblockPool[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer estblockPool[pool].Put(tmp) + + d = calcBlockSizeSmall(src, tmp) + } else { + const sz, pool = 32768, 1 + tmp, ok := estblockPool[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer estblockPool[pool].Put(tmp) + + d = calcBlockSize(src, tmp) + } + + if d == 0 { + return -1 + } + // Size of the varint encoded block size. + d += (bits.Len64(uint64(len(src))) + 7) / 7 + + if d >= len(src) { + return -1 + } + return d +} + +// EncodeBetter returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBetter compresses better than Encode but typically with a +// 10-40% speed decrease on both compression and decompression. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeBetter(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
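As a usage note for the block API in this file: every variant writes the uvarint length header first, and the result is read back by the package's block decoder. A hypothetical sketch — s2.Decode is the public block decoder declared elsewhere in this package, and EncodeSnappy (defined below) produces Snappy-compatible output:

    package main

    import (
        "bytes"
        "fmt"

        "github.com/klauspost/compress/s2"
    )

    func main() {
        src := bytes.Repeat([]byte("compressible "), 100)
        enc := s2.Encode(nil, src)        // S2 block, not Snappy compatible
        snap := s2.EncodeSnappy(nil, src) // Snappy-compatible block
        dec, err := s2.Decode(nil, enc)
        fmt.Println(len(enc), len(snap), bytes.Equal(dec, src), err)
    }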
+ d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlockBetter(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeBest returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// EncodeBest compresses as good as reasonably possible but with a +// big speed decrease. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeBest(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlockBest(dst[d:], src, nil) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappy returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappy(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappyBetter returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. 
+// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappyBetter(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockBetterSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// EncodeSnappyBest returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappyBest(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockBestSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// ConcatBlocks will concatenate the supplied blocks and append them to the supplied destination. +// If the destination is nil or too small, a new will be allocated. +// The blocks are not validated, so garbage in = garbage out. +// dst may not overlap block data. +// Any data in dst is preserved as is, so it will not be considered a block. +func ConcatBlocks(dst []byte, blocks ...[]byte) ([]byte, error) { + totalSize := uint64(0) + compSize := 0 + for _, b := range blocks { + l, hdr, err := decodedLen(b) + if err != nil { + return nil, err + } + totalSize += uint64(l) + compSize += len(b) - hdr + } + if totalSize == 0 { + dst = append(dst, 0) + return dst, nil + } + if totalSize > math.MaxUint32 { + return nil, ErrTooLarge + } + var tmp [binary.MaxVarintLen32]byte + hdrSize := binary.PutUvarint(tmp[:], totalSize) + wantSize := hdrSize + compSize + + if cap(dst)-len(dst) < wantSize { + dst = append(make([]byte, 0, wantSize+len(dst)), dst...) + } + dst = append(dst, tmp[:hdrSize]...) + for _, b := range blocks { + _, hdr, err := decodedLen(b) + if err != nil { + return nil, err + } + dst = append(dst, b[hdr:]...) + } + return dst, nil +} + +// inputMargin is the minimum number of extra input bytes to keep, inside +// encodeBlock's inner loop. 
On some architectures, this margin lets us +// implement a fast path for emitLiteral, where the copy of short (<= 16 byte) +// literals can be implemented as a single load to and store from a 16-byte +// register. That literal's actual length can be as short as 1 byte, so this +// can copy up to 15 bytes too much, but that's OK as subsequent iterations of +// the encoding loop will fix up the copy overrun, and this inputMargin ensures +// that we don't overrun the dst and src buffers. +const inputMargin = 8 + +// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that +// will be accepted by the encoder. +const minNonLiteralBlockSize = 32 + +const intReduction = 2 - (1 << (^uint(0) >> 63)) // 1 (32 bits) or 0 (64 bits) + +// MaxBlockSize is the maximum value where MaxEncodedLen will return a valid block size. +// Blocks this big are highly discouraged, though. +// Half the size on 32 bit systems. +const MaxBlockSize = (1<<(32-intReduction) - 1) - binary.MaxVarintLen32 - 5 + +// MaxEncodedLen returns the maximum length of a snappy block, given its +// uncompressed length. +// +// It will return a negative value if srcLen is too large to encode. +// 32 bit platforms will have lower thresholds for rejecting big content. +func MaxEncodedLen(srcLen int) int { + n := uint64(srcLen) + if intReduction == 1 { + // 32 bits + if n > math.MaxInt32 { + // Also includes negative. + return -1 + } + } else if n > 0xffffffff { + // 64 bits + // Also includes negative. + return -1 + } + // Size of the varint encoded block size. + n = n + uint64((bits.Len64(n)+7)/7) + + // Add maximum size of encoding block as literals. + n += uint64(literalExtraSize(int64(srcLen))) + if intReduction == 1 { + // 32 bits + if n > math.MaxInt32 { + return -1 + } + } else if n > 0xffffffff { + // 64 bits + // Also includes negative. + return -1 + } + return int(n) +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go new file mode 100644 index 0000000000..9977045696 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_all.go @@ -0,0 +1,1068 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "bytes" + "encoding/binary" + "fmt" + "math/bits" +) + +func load32(b []byte, i int) uint32 { + return binary.LittleEndian.Uint32(b[i:]) +} + +func load64(b []byte, i int) uint64 { + return binary.LittleEndian.Uint64(b[i:]) +} + +// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash6(u uint64, h uint8) uint32 { + const prime6bytes = 227718039650203 + return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) +} + +func encodeGo(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
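The header written here is the same uvarint that MaxEncodedLen and EstimateBlockSize in encode.go budget for with (bits.Len64(n)+7)/7. That expression is a cheap upper bound rather than the exact varint length: each varint byte carries 7 payload bits, and the +7 rounding covers n == 0 but overshoots by one byte whenever the bit length is an exact multiple of 7 — which is fine for a maximum. A quick hypothetical check:

    package main

    import (
        "encoding/binary"
        "fmt"
        "math/bits"
    )

    func main() {
        var tmp [binary.MaxVarintLen64]byte
        for _, n := range []uint64{0, 1, 127, 128, 1 << 20} {
            bound := (bits.Len64(n) + 7) / 7
            actual := binary.PutUvarint(tmp[:], n)
            fmt.Println(n, "bound:", bound, "actual:", actual) // bound >= actual always
        }
    }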
+ d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + n := encodeBlockGo(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + +// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + + debug = false + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. 
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +func encodeBlockSnappyGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. 
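The search that begins at s == 1 is driven by hash6, defined at the top of this file: a multiplicative hash over the low six bytes of the 64-bit load. The u << 16 discards bytes 6 and 7, the odd 48-bit prime mixes the rest, and the top h bits select the table slot. A self-contained check of the "low 6 bytes only" property, with hash6 inlined here since it is unexported:

    package main

    import "fmt"

    // hash6 as defined in encode_all.go above.
    func hash6(u uint64, h uint8) uint32 {
        const prime6bytes = 227718039650203
        return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
    }

    func main() {
        a := uint64(0x0102030405060708)
        b := a ^ (0xff << 48) // differs only in byte 6 of the little-endian load
        fmt.Println(hash6(a, 14) == hash6(b, 14)) // true: byte 6 never reaches the hash
    }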
+ s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. 
+				return 0
+			}
+			// Check for an immediate match, otherwise start search at s+1
+			x := load64(src, s-2)
+			m2Hash := hash6(x, tableBits)
+			currHash := hash6(x>>16, tableBits)
+			candidate = int(table[currHash])
+			table[m2Hash] = uint32(s - 2)
+			table[currHash] = uint32(s)
+			if uint32(x>>16) != load32(src, candidate) {
+				cv = load64(src, s+1)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
+
+// encodeBlockDictGo encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockDictGo(dst, src []byte, dict *Dict) (d int) {
+	// Initialize the hash table.
+	const (
+		tableBits    = 14
+		maxTableSize = 1 << tableBits
+		maxAhead     = 8 // maximum bytes ahead without checking sLimit
+
+		debug = false
+	)
+	dict.initFast()
+
+	var table [maxTableSize]uint32
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+	if sLimit > MaxDictSrcOffset-maxAhead {
+		sLimit = MaxDictSrcOffset - maxAhead
+	}
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - len(src)>>5 - 5
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form can start with a dict entry (copy or repeat).
+	s := 0
+
+	// Convert dict repeat to offset
+	repeat := len(dict.dict) - dict.repeat
+	cv := load64(src, 0)
+
+	// While in dict
+searchDict:
+	for {
+		// Next src position to check
+		nextS := s + (s-nextEmit)>>6 + 4
+		hash0 := hash6(cv, tableBits)
+		hash1 := hash6(cv>>8, tableBits)
+		if nextS > sLimit {
+			if debug {
+				fmt.Println("slimit reached", s, nextS)
+			}
+			break searchDict
+		}
+		candidateDict := int(dict.fastTable[hash0])
+		candidateDict2 := int(dict.fastTable[hash1])
+		candidate2 := int(table[hash1])
+		candidate := int(table[hash0])
+		table[hash0] = uint32(s)
+		table[hash1] = uint32(s + 1)
+		hash2 := hash6(cv>>16, tableBits)
+
+		// Check repeat at offset checkRep.
+		const checkRep = 1
+
+		if repeat > s {
+			candidate := len(dict.dict) - repeat + s
+			if repeat-s >= 4 && uint32(cv) == load32(dict.dict, candidate) {
+				// Extend back
+				base := s
+				for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
+					i--
+					base--
+				}
+				// Bail if we exceed the maximum size.
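+				// dstLimit is len(src) - len(src)>>5 - 5: unless the output beats
+				// the input by roughly 3%, the whole attempt is abandoned.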
+ if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != base { + fmt.Println("emitted ", base-nextEmit, "literals") + } + s += 4 + candidate += 4 + for candidate < len(dict.dict)-8 && s <= len(src)-8 { + if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + d += emitRepeat(dst[d:], repeat, s-base) + if debug { + fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + break searchDict + } + cv = load64(src, s) + continue + } + } else if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != base { + fmt.Println("emitted ", base-nextEmit, "literals") + } + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + + nextEmit = s + if s >= sLimit { + break searchDict + } + if debug { + fmt.Println("emitted reg repeat", s-base, "s:", s) + } + cv = load64(src, s) + continue searchDict + } + if s == 0 { + cv = load64(src, nextS) + s = nextS + continue searchDict + } + // Start with table. These matches will always be closer. + if uint32(cv) == load32(src, candidate) { + goto emitMatch + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + goto emitMatch + } + + // Check dict. Dicts have longer offsets, so we want longer matches. + if cv == load64(dict.dict, candidateDict) { + table[hash2] = uint32(s + 2) + goto emitDict + } + + candidateDict = int(dict.fastTable[hash2]) + // Check if upper 7 bytes match + if candidateDict2 >= 1 { + if cv^load64(dict.dict, candidateDict2-1) < (1 << 8) { + table[hash2] = uint32(s + 2) + candidateDict = candidateDict2 + s++ + goto emitDict + } + } + + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + goto emitMatch + } + if candidateDict >= 2 { + // Check if upper 6 bytes match + if cv^load64(dict.dict, candidateDict-2) < (1 << 16) { + s += 2 + goto emitDict + } + } + + cv = load64(src, nextS) + s = nextS + continue searchDict + + emitDict: + { + if debug { + if load32(dict.dict, candidateDict) != load32(src, s) { + panic("dict emit mismatch") + } + } + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidateDict > 0 && s > nextEmit && dict.dict[candidateDict-1] == src[s-1] { + candidateDict-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. 
But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = s + (len(dict.dict)) - candidateDict + + // Extend the 4-byte match as long as possible. + s += 4 + candidateDict += 4 + for s <= len(src)-8 && len(dict.dict)-candidateDict >= 8 { + if diff := load64(src, s) ^ load64(dict.dict, candidateDict); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateDict += 8 + } + + // Matches longer than 64 are split. + if s <= sLimit || s-base < 8 { + d += emitCopy(dst[d:], repeat, s-base) + } else { + // Split to ensure we don't start a copy within next block + d += emitCopy(dst[d:], repeat, 4) + d += emitRepeat(dst[d:], repeat, s-base-4) + } + if false { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := dict.dict[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if debug { + fmt.Println("emitted dict copy, length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + break searchDict + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index and continue loop to try new candidate. + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>8, tableBits) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s - 1) + cv = load64(src, s) + } + continue + } + emitMatch: + + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. 
+ if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if debug { + fmt.Println("emitted src copy, length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + break searchDict + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + + // Search without dict: + if repeat > s { + repeat = 0 + } + + // No more dict + sLimit = len(src) - inputMargin + if s >= sLimit { + goto emitRemainder + } + if debug { + fmt.Println("non-dict matching at", s, "repeat:", repeat) + } + cv = load64(src, s) + if debug { + fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s) + } + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if repeat > 0 && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != base { + fmt.Println("emitted ", base-nextEmit, "literals") + } + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + if debug { + fmt.Println("emitted src repeat length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. 
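+		// Bytes before s may match as well; every byte claimed here lengthens
+		// the copy and shortens the literal that must be emitted before it.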
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if debug { + fmt.Println("emitted src copy, length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + if debug && nextEmit != s { + fmt.Println("emitted ", len(src)-nextEmit, "literals") + } + } + return d +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go new file mode 100644 index 0000000000..7aadd255fe --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go @@ -0,0 +1,317 @@ +//go:build !appengine && !noasm && gc +// +build !appengine,!noasm,gc + +package s2 + +import ( + "sync" + + "github.com/klauspost/compress/internal/race" +) + +const hasAmd64Asm = true + +var encPools [4]sync.Pool + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlock(dst, src []byte) (d int) { + race.ReadSlice(src) + race.WriteSlice(dst) + + const ( + // Use 12 bit table when less than... 
+ limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + + if len(src) >= 4<<20 { + const sz, pool = 65536, 0 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm(dst, src, tmp) + } + if len(src) >= limit12B { + const sz, pool = 65536, 0 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm4MB(dst, src, tmp) + } + if len(src) >= limit10B { + const sz, pool = 16384, 1 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm12B(dst, src, tmp) + } + if len(src) >= limit8B { + const sz, pool = 4096, 2 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm10B(dst, src, tmp) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + const sz, pool = 1024, 3 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm8B(dst, src, tmp) +} + +var encBetterPools [5]sync.Pool + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetter(dst, src []byte) (d int) { + race.ReadSlice(src) + race.WriteSlice(dst) + + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + + if len(src) > 4<<20 { + const sz, pool = 589824, 0 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm(dst, src, tmp) + } + if len(src) >= limit12B { + const sz, pool = 589824, 0 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + + return encodeBetterBlockAsm4MB(dst, src, tmp) + } + if len(src) >= limit10B { + const sz, pool = 81920, 0 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + + return encodeBetterBlockAsm12B(dst, src, tmp) + } + if len(src) >= limit8B { + const sz, pool = 20480, 1 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm10B(dst, src, tmp) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + + const sz, pool = 5120, 2 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm8B(dst, src, tmp) +} + +// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
+//
+// It also assumes that:
+//
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockSnappy(dst, src []byte) (d int) {
+	race.ReadSlice(src)
+	race.WriteSlice(dst)
+
+	const (
+		// Use 12 bit table when less than...
+		limit12B = 16 << 10
+		// Use 10 bit table when less than...
+		limit10B = 4 << 10
+		// Use 8 bit table when less than...
+		limit8B = 512
+	)
+	if len(src) > 65536 {
+		const sz, pool = 65536, 0
+		tmp, ok := encPools[pool].Get().(*[sz]byte)
+		if !ok {
+			tmp = &[sz]byte{}
+		}
+		race.WriteSlice(tmp[:])
+		defer encPools[pool].Put(tmp)
+		return encodeSnappyBlockAsm(dst, src, tmp)
+	}
+	if len(src) >= limit12B {
+		const sz, pool = 65536, 0
+		tmp, ok := encPools[pool].Get().(*[sz]byte)
+		if !ok {
+			tmp = &[sz]byte{}
+		}
+		race.WriteSlice(tmp[:])
+		defer encPools[pool].Put(tmp)
+		return encodeSnappyBlockAsm64K(dst, src, tmp)
+	}
+	if len(src) >= limit10B {
+		const sz, pool = 16384, 1
+		tmp, ok := encPools[pool].Get().(*[sz]byte)
+		if !ok {
+			tmp = &[sz]byte{}
+		}
+		race.WriteSlice(tmp[:])
+		defer encPools[pool].Put(tmp)
+		return encodeSnappyBlockAsm12B(dst, src, tmp)
+	}
+	if len(src) >= limit8B {
+		const sz, pool = 4096, 2
+		tmp, ok := encPools[pool].Get().(*[sz]byte)
+		if !ok {
+			tmp = &[sz]byte{}
+		}
+		race.WriteSlice(tmp[:])
+		defer encPools[pool].Put(tmp)
+		return encodeSnappyBlockAsm10B(dst, src, tmp)
+	}
+	if len(src) < minNonLiteralBlockSize {
+		return 0
+	}
+	const sz, pool = 1024, 3
+	tmp, ok := encPools[pool].Get().(*[sz]byte)
+	if !ok {
+		tmp = &[sz]byte{}
+	}
+	race.WriteSlice(tmp[:])
+	defer encPools[pool].Put(tmp)
+	return encodeSnappyBlockAsm8B(dst, src, tmp)
+}
+
+// encodeBlockBetterSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBetterSnappy(dst, src []byte) (d int) {
+	race.ReadSlice(src)
+	race.WriteSlice(dst)
+
+	const (
+		// Use 12 bit table when less than...
+		limit12B = 16 << 10
+		// Use 10 bit table when less than...
+		limit10B = 4 << 10
+		// Use 8 bit table when less than...
+ limit8B = 512 + ) + if len(src) > 65536 { + const sz, pool = 589824, 0 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeSnappyBetterBlockAsm(dst, src, tmp) + } + + if len(src) >= limit12B { + const sz, pool = 294912, 4 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + + return encodeSnappyBetterBlockAsm64K(dst, src, tmp) + } + if len(src) >= limit10B { + const sz, pool = 81920, 0 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + + return encodeSnappyBetterBlockAsm12B(dst, src, tmp) + } + if len(src) >= limit8B { + const sz, pool = 20480, 1 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeSnappyBetterBlockAsm10B(dst, src, tmp) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + + const sz, pool = 5120, 2 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeSnappyBetterBlockAsm8B(dst, src, tmp) +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go new file mode 100644 index 0000000000..47bac74234 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_best.go @@ -0,0 +1,796 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "fmt" + "math" + "math/bits" +) + +// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBest(dst, src []byte, dict *Dict) (d int) { + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + + inputMargin = 8 + 2 + + debug = false + ) + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + sLimitDict := len(src) - inputMargin + if sLimitDict > MaxDictSrcOffset-inputMargin { + sLimitDict = MaxDictSrcOffset - inputMargin + } + + var lTable [maxLTableSize]uint64 + var sTable [maxSTableSize]uint64 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. 
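+	// Each table slot packs two candidates: the newest position in the low 32
+	// bits and the previous one in the high 32 bits (see getCur/getPrev below),
+	// so every probe can score both a recent and an older match.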
+ s := 1 + repeat := 1 + if dict != nil { + dict.initBest() + s = 0 + repeat = len(dict.dict) - dict.repeat + } + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + const lowbitMask = 0xffffffff + getCur := func(x uint64) int { + return int(x & lowbitMask) + } + getPrev := func(x uint64) int { + return int(x >> 32) + } + const maxSkip = 64 + + for { + type match struct { + offset int + s int + length int + score int + rep, dict bool + } + var best match + for { + // Next src position to check + nextS := (s-nextEmit)>>8 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + if nextS > sLimit { + goto emitRemainder + } + if dict != nil && s >= MaxDictSrcOffset { + dict = nil + if repeat > s { + repeat = math.MinInt32 + } + } + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. + score := m.length - m.s + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + if m.rep { + return score - emitRepeatSize(offset, m.length) + } + return score - emitCopySize(offset, m.length) + } + + matchAt := func(offset, s int, first uint32, rep bool) match { + if best.length != 0 && best.s-best.offset == s-offset { + // Don't retest if we have the same offset. + return match{offset: offset, s: s} + } + if load32(src, offset) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + offset, rep: rep} + s += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + m.length -= offset + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. + m.length = 0 + } + return m + } + matchDict := func(candidate, s int, first uint32, rep bool) match { + if s >= MaxDictSrcOffset { + return match{offset: candidate, s: s} + } + // Calculate offset as if in continuous array with s + offset := -len(dict.dict) + candidate + if best.length != 0 && best.s-best.offset == s-offset && !rep { + // Don't retest if we have the same offset. + return match{offset: offset, s: s} + } + + if load32(dict.dict, candidate) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + candidate, rep: rep, dict: true} + s += 4 + if !rep { + for s < sLimitDict && m.length < len(dict.dict) { + if len(src)-s < 8 || len(dict.dict)-m.length < 8 { + if src[s] == dict.dict[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + } else { + for s < len(src) && m.length < len(dict.dict) { + if len(src)-s < 8 || len(dict.dict)-m.length < 8 { + if src[s] == dict.dict[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + } + m.length -= candidate + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. 
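+				// score <= -s means the match saves nothing once its own encoding
+				// cost is subtracted, so zero the length to discard the candidate.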
+ m.length = 0 + } + return m + } + + bestOf := func(a, b match) match { + if b.length == 0 { + return a + } + if a.length == 0 { + return b + } + as := a.score + b.s + bs := b.score + a.s + if as >= bs { + return a + } + return b + } + + if s > 0 { + best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false)) + best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false)) + } + if dict != nil { + candidateL := dict.bestTableLong[hashL] + candidateS := dict.bestTableShort[hashS] + best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateL>>16), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS>>16), s, uint32(cv), false)) + } + { + if (dict == nil || repeat <= s) && repeat > 0 { + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true)) + } else if s-repeat < -4 && dict != nil { + candidate := len(dict.dict) - (repeat - s) + best = bestOf(best, matchDict(candidate, s, uint32(cv), true)) + candidate++ + best = bestOf(best, matchDict(candidate, s+1, uint32(cv>>8), true)) + } + + if best.length > 0 { + hashS := hash4(cv>>8, sTableBits) + // s+1 + nextShort := sTable[hashS] + s := s + 1 + cv := load64(src, s) + hashL := hash8(cv, lTableBits) + nextLong := lTable[hashL] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) + + // Dict at + 1 + if dict != nil { + candidateL := dict.bestTableLong[hashL] + candidateS := dict.bestTableShort[hashS] + + best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false)) + } + + // s+2 + if true { + hashS := hash4(cv>>8, sTableBits) + + nextShort = sTable[hashS] + s++ + cv = load64(src, s) + hashL := hash8(cv, lTableBits) + nextLong = lTable[hashL] + + if (dict == nil || repeat <= s) && repeat > 0 { + // Repeat at + 2 + best = bestOf(best, matchAt(s-repeat, s, uint32(cv), true)) + } else if repeat-s > 4 && dict != nil { + candidate := len(dict.dict) - (repeat - s) + best = bestOf(best, matchDict(candidate, s, uint32(cv), true)) + } + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false)) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) + + // Dict at +2 + // Very small gain + if dict != nil { + candidateL := dict.bestTableLong[hashL] + candidateS := dict.bestTableShort[hashS] + + best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false)) + } + } + // Search for a match at best match end, see if that is better. + // Allow some bytes at the beginning to mismatch. + // Sweet spot is around 1-2 bytes, but depends on input. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
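+			// Candidates found at the end position are rewound by the tested
+			// length, so the comparison restarts near the start of the best match.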
+ const skipBeginning = 2 + const skipEnd = 1 + if sAt := best.s + best.length - skipEnd; sAt < sLimit { + + sBack := best.s + skipBeginning - skipEnd + backL := best.length - skipBeginning + // Load initial values + cv = load64(src, sBack) + + // Grab candidates... + next := lTable[hash8(load64(src, sAt), lTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + // Disabled: Extremely small gain + if false { + next = sTable[hash4(load64(src, sAt), sTableBits)] + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + } + } + } + } + + // Update table + lTable[hashL] = uint64(s) | candidateL<<32 + sTable[hashS] = uint64(s) | candidateS<<32 + + if best.length > 0 { + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards, not needed for repeats... + s = best.s + if !best.rep && !best.dict { + for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] { + best.offset-- + best.length++ + s-- + } + } + if false && best.offset >= s { + panic(fmt.Errorf("t %d >= s %d", best.offset, s)) + } + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := s - best.offset + s += best.length + + if offset > 65535 && s-base <= 5 && !best.rep { + // Bail if the match is equal or worse to the encoding. + s = best.s + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + if debug && nextEmit != base { + fmt.Println("EMIT", base-nextEmit, "literals. base-after:", base) + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + if best.rep { + if nextEmit > 0 || best.dict { + if debug { + fmt.Println("REPEAT, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best) + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], offset, best.length) + } else { + // First match without dict cannot be a repeat. + if debug { + fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best) + } + d += emitCopy(dst[d:], offset, best.length) + } + } else { + if debug { + fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best) + } + d += emitCopy(dst[d:], offset, best.length) + } + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Fill tables... + for i := best.s + 1; i < s; i++ { + cv0 := load64(src, i) + long0 := hash8(cv0, lTableBits) + short0 := hash4(cv0, sTableBits) + lTable[long0] = uint64(i) | lTable[long0]<<32 + sTable[short0] = uint64(i) | sTable[short0]<<32 + } + cv = load64(src, s) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + if debug && nextEmit != s { + fmt.Println("emitted ", len(src)-nextEmit, "literals") + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. 
It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBestSnappy(dst, src []byte) (d int) { + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + + inputMargin = 8 + 2 + ) + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + var lTable [maxLTableSize]uint64 + var sTable [maxSTableSize]uint64 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + const lowbitMask = 0xffffffff + getCur := func(x uint64) int { + return int(x & lowbitMask) + } + getPrev := func(x uint64) int { + return int(x >> 32) + } + const maxSkip = 64 + + for { + type match struct { + offset int + s int + length int + score int + } + var best match + for { + // Next src position to check + nextS := (s-nextEmit)>>8 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + if nextS > sLimit { + goto emitRemainder + } + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. + score := m.length - m.s + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + + return score - emitCopyNoRepeatSize(offset, m.length) + } + + matchAt := func(offset, s int, first uint32) match { + if best.length != 0 && best.s-best.offset == s-offset { + // Don't retest if we have the same offset. + return match{offset: offset, s: s} + } + if load32(src, offset) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + offset} + s += 4 + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + m.length -= offset + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. 
+ m.length = 0 + } + return m + } + + bestOf := func(a, b match) match { + if b.length == 0 { + return a + } + if a.length == 0 { + return b + } + as := a.score + b.s + bs := b.score + a.s + if as >= bs { + return a + } + return b + } + + best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv))) + + { + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) + if best.length > 0 { + // s+1 + nextShort := sTable[hash4(cv>>8, sTableBits)] + s := s + 1 + cv := load64(src, s) + nextLong := lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) + // Repeat at + 2 + best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) + + // s+2 + if true { + nextShort = sTable[hash4(cv>>8, sTableBits)] + s++ + cv = load64(src, s) + nextLong = lTable[hash8(cv, lTableBits)] + best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv))) + } + // Search for a match at best match end, see if that is better. + if sAt := best.s + best.length; sAt < sLimit { + sBack := best.s + backL := best.length + // Load initial values + cv = load64(src, sBack) + // Search for mismatch + next := lTable[hash8(load64(src, sAt), lTableBits)] + //next := sTable[hash4(load64(src, sAt), sTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + } + } + } + + // Update table + lTable[hashL] = uint64(s) | candidateL<<32 + sTable[hashS] = uint64(s) | candidateS<<32 + + if best.length > 0 { + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards, not needed for repeats... + s = best.s + if true { + for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] { + best.offset-- + best.length++ + s-- + } + } + if false && best.offset >= s { + panic(fmt.Errorf("t %d >= s %d", best.offset, s)) + } + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := s - best.offset + + s += best.length + + if offset > 65535 && s-base <= 5 { + // Bail if the match is equal or worse to the encoding. + s = best.s + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, best.length) + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Fill tables... + for i := best.s + 1; i < s; i++ { + cv0 := load64(src, i) + long0 := hash8(cv0, lTableBits) + short0 := hash4(cv0, sTableBits) + lTable[long0] = uint64(i) | lTable[long0]<<32 + sTable[short0] = uint64(i) | sTable[short0]<<32 + } + cv = load64(src, s) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. 
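+		// The trailing literals count against dstLimit too, so a block that
+		// only barely compressed is still rejected here.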
+ if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// emitCopySize returns the size to encode the offset+length +// +// It assumes that: +// +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopySize(offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitRepeatSize(offset, length) + } + i = 5 + } + if length == 0 { + return i + } + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + if offset < 2048 { + // Emit 8 bytes, then rest as repeats... + return 2 + emitRepeatSize(offset, length-8) + } + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + emitRepeatSize(offset, length-60) + } + if length >= 12 || offset >= 2048 { + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + return 2 +} + +// emitCopyNoRepeatSize returns the size to encode the offset+length +// +// It assumes that: +// +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopyNoRepeatSize(offset, length int) int { + if offset >= 65536 { + return 5 + 5*(length/64) + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + 3*(length/60) + } + if length >= 12 || offset >= 2048 { + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + return 2 +} + +// emitRepeatSize returns the number of bytes required to encode a repeat. +// Length must be at least 4 and < 1<<24 +func emitRepeatSize(offset, length int) int { + // Repeat offset, make length cheaper + if length <= 4+4 || (length < 8+4 && offset < 2048) { + return 2 + } + if length < (1<<8)+4+4 { + return 3 + } + if length < (1<<16)+(1<<8)+4 { + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= (1 << 16) - 4 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + } + if left > 0 { + return 5 + emitRepeatSize(offset, left) + } + return 5 +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go new file mode 100644 index 0000000000..544cb1e17b --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_better.go @@ -0,0 +1,1106 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "bytes" + "fmt" + "math/bits" +) + +// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4(u uint64, h uint8) uint32 { + const prime4bytes = 2654435761 + return (uint32(u) * prime4bytes) >> ((32 - h) & 31) +} + +// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash5(u uint64, h uint8) uint32 { + const prime5bytes = 889523592379 + return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63)) +} + +// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. 
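+// The shift by 64-56 drops the top byte, keeping the low 7 bytes; the multiply
+// by a large odd prime then spreads them, and the top h bits form the index.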
+func hash7(u uint64, h uint8) uint32 {
+	const prime7bytes = 58295818150454627
+	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
+}
+
+// hash8 returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash8(u uint64, h uint8) uint32 {
+	const prime8bytes = 0xcf1bbcdcb7a56463
+	return uint32((u * prime8bytes) >> ((64 - h) & 63))
+}
+
+// encodeBlockBetterGo encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBetterGo(dst, src []byte) (d int) {
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+	if len(src) < minNonLiteralBlockSize {
+		return 0
+	}
+
+	// Initialize the hash tables.
+	const (
+		// Long hash matches.
+		lTableBits    = 17
+		maxLTableSize = 1 << lTableBits
+
+		// Short hash matches.
+		sTableBits    = 14
+		maxSTableSize = 1 << sTableBits
+	)
+
+	var lTable [maxLTableSize]uint32
+	var sTable [maxSTableSize]uint32
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - len(src)>>5 - 6
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	cv := load64(src, s)
+
+	// We initialize repeat to 0, so we never match on first attempt
+	repeat := 0
+
+	for {
+		candidateL := 0
+		nextS := 0
+		for {
+			// Next src position to check
+			nextS = s + (s-nextEmit)>>7 + 1
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			hashL := hash7(cv, lTableBits)
+			hashS := hash4(cv, sTableBits)
+			candidateL = int(lTable[hashL])
+			candidateS := int(sTable[hashS])
+			lTable[hashL] = uint32(s)
+			sTable[hashS] = uint32(s)
+
+			valLong := load64(src, candidateL)
+			valShort := load64(src, candidateS)
+
+			// If long matches at least 8 bytes, use that.
+			if cv == valLong {
+				break
+			}
+			if cv == valShort {
+				candidateL = candidateS
+				break
+			}
+
+			// Check repeat at offset checkRep.
+			const checkRep = 1
+			// Minimum length of a repeat. Tested with various values.
+			// While 4-5 offers improvements in some, 6 reduces
+			// regressions significantly.
+			const wantRepeatBytes = 6
+			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
+			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
+				base := s + checkRep
+				// Extend back
+				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
+					i--
+					base--
+				}
+				d += emitLiteral(dst[d:], src[nextEmit:base])
+
+				// Extend forward
+				candidate := s - repeat + wantRepeatBytes + checkRep
+				s += wantRepeatBytes + checkRep
+				for s < len(src) {
+					if len(src)-s < 8 {
+						if src[s] == src[candidate] {
+							s++
+							candidate++
+							continue
+						}
+						break
+					}
+					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+						s += bits.TrailingZeros64(diff) >> 3
+						break
+					}
+					s += 8
+					candidate += 8
+				}
+				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
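+				// A repeat reuses the previous offset, so short repeats encode in
+				// as little as 2 bytes versus up to 5 for a copy with a new offset.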
+ d += emitRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if repeat == offset { + d += emitRepeat(dst[d:], offset, s-base) + } else { + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + // lTable could be postponed, but very minor difference. + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
+// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 16 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 0 + const maxSkip = 100 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = (s-nextEmit)>>7 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + if uint32(cv) == load32(src, candidateL) { + break + } + + // Check our short candidate + if uint32(cv) == load32(src, candidateS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, s-base) + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. 
+			return 0
+		}
+
+		// Index short & long
+		index0 := base + 1
+		index1 := s - 2
+
+		cv0 := load64(src, index0)
+		cv1 := load64(src, index1)
+		lTable[hash7(cv0, lTableBits)] = uint32(index0)
+		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+
+		lTable[hash7(cv1, lTableBits)] = uint32(index1)
+		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+		index0 += 1
+		index1 -= 1
+		cv = load64(src, s)
+
+		// Index large values sparsely in between.
+		// We do two starting from different offsets for speed.
+		index2 := (index0 + index1 + 1) >> 1
+		for index2 < index1 {
+			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
+			index0 += 2
+			index2 += 2
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
+
+// encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) {
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	// Initialize the hash tables.
+	const (
+		// Long hash matches.
+		lTableBits    = 17
+		maxLTableSize = 1 << lTableBits
+
+		// Short hash matches.
+		sTableBits    = 14
+		maxSTableSize = 1 << sTableBits
+
+		maxAhead = 8 // maximum bytes ahead without checking sLimit
+
+		debug = false
+	)
+
+	sLimit := len(src) - inputMargin
+	if sLimit > MaxDictSrcOffset-maxAhead {
+		sLimit = MaxDictSrcOffset - maxAhead
+	}
+	if len(src) < minNonLiteralBlockSize {
+		return 0
+	}
+
+	dict.initBetter()
+
+	var lTable [maxLTableSize]uint32
+	var sTable [maxSTableSize]uint32
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - len(src)>>5 - 6
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form can start with a dict entry (copy or repeat).
+	s := 0
+	cv := load64(src, s)
+
+	// Convert dict repeat to offset
+	repeat := len(dict.dict) - dict.repeat
+
+	// While in dict
+searchDict:
+	for {
+		candidateL := 0
+		nextS := 0
+		for {
+			// Next src position to check
+			nextS = s + (s-nextEmit)>>7 + 1
+			if nextS > sLimit {
+				break searchDict
+			}
+			hashL := hash7(cv, lTableBits)
+			hashS := hash4(cv, sTableBits)
+			candidateL = int(lTable[hashL])
+			candidateS := int(sTable[hashS])
+			dictL := int(dict.betterTableLong[hashL])
+			dictS := int(dict.betterTableShort[hashS])
+			lTable[hashL] = uint32(s)
+			sTable[hashS] = uint32(s)
+
+			valLong := load64(src, candidateL)
+			valShort := load64(src, candidateS)
+
+			// If long matches at least 8 bytes, use that.
+			if s != 0 {
+				if cv == valLong {
+					goto emitMatch
+				}
+				if cv == valShort {
+					candidateL = candidateS
+					goto emitMatch
+				}
+			}
+
+			// Check dict repeat.
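+			// While s is still within the dictionary window, the repeat offset
+			// can reach back into dict.dict rather than into src itself.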
+ if repeat >= s+4 { + candidate := len(dict.dict) - repeat + s + if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) { + // Extend back + base := s + for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != base { + fmt.Println("emitted ", base-nextEmit, "literals") + } + s += 4 + candidate += 4 + for candidate < len(dict.dict)-8 && s <= len(src)-8 { + if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + d += emitRepeat(dst[d:], repeat, s-base) + if debug { + fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + break searchDict + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + cv = load64(src, s) + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } + continue + } + } + // Don't try to find match at s==0 + if s == 0 { + cv = load64(src, nextS) + s = nextS + continue + } + + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { + goto emitMatch + } + + // Long dict... + if uint32(cv) == load32(dict.dict, dictL) { + candidateL = dictL + goto emitDict + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + goto emitMatch + } + // Use our short candidate. + candidateL = candidateS + goto emitMatch + } + if uint32(cv) == load32(dict.dict, dictS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + goto emitMatch + } + candidateL = dictS + goto emitDict + } + cv = load64(src, nextS) + s = nextS + } + emitDict: + { + if debug { + if load32(dict.dict, candidateL) != load32(src, s) { + panic("dict emit mismatch") + } + } + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + offset := s + (len(dict.dict)) - candidateL + + // Extend the 4-byte match as long as possible. 
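+			// Extension runs 8 bytes at a time: the XOR of two equal words is
+			// zero, and on the first difference bits.TrailingZeros64(diff)>>3
+			// converts the lowest mismatching bit into a byte count (loads are
+			// little-endian, so low bits correspond to earlier bytes).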
+ s += 4 + candidateL += 4 + for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 { + if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if repeat == offset { + if debug { + fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL) + } + d += emitRepeat(dst[d:], offset, s-base) + } else { + if debug { + fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL) + } + // Matches longer than 64 are split. + if s <= sLimit || s-base < 8 { + d += emitCopy(dst[d:], offset, s-base) + } else { + // Split to ensure we don't start a copy within next block. + d += emitCopy(dst[d:], offset, 4) + d += emitRepeat(dst[d:], offset, s-base-4) + } + repeat = offset + } + if false { + // Validate match. + if s <= candidateL { + panic("s <= candidate") + } + a := src[base:s] + b := dict.dict[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + break searchDict + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // index every second long in between. + for index0 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) + index0 += 2 + index1 -= 2 + } + } + continue + } + emitMatch: + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + if repeat == offset { + if debug { + fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s) + } + d += emitRepeat(dst[d:], offset, s-base) + } else { + if debug { + fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s) + } + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. 
+ return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + + // Search without dict: + if repeat > s { + repeat = 0 + } + + // No more dict + sLimit = len(src) - inputMargin + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + if debug { + fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s) + } + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>7 + 1 + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + if cv == valShort { + candidateL = candidateS + break + } + + // Check repeat at offset checkRep. + const checkRep = 1 + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. 
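+			// The full 8-byte comparisons above have already failed by the time
+			// control reaches here, so only the low 32 bits are tested: a hit
+			// means at least a 4-byte (and, from this load alone, at most a
+			// 7-byte) prefix match at the long candidate.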
+ if uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if repeat == offset { + d += emitRepeat(dst[d:], offset, s-base) + } else { + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go new file mode 100644 index 0000000000..dd1c973ca5 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -0,0 +1,729 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +package s2 + +import ( + "bytes" + "math/bits" +) + +const hasAmd64Asm = false + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlock(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockGo(dst, src) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
+// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetter(dst, src []byte) (d int) { + return encodeBlockBetterGo(dst, src) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetterSnappy(dst, src []byte) (d int) { + return encodeBlockBetterSnappyGo(dst, src) +} + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockSnappy(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockSnappyGo(dst, src) +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +func emitLiteral(dst, lit []byte) int { + if len(lit) == 0 { + return 0 + } + const num = 63<<2 | tagLiteral + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[1] = uint8(n) + dst[0] = 60<<2 | tagLiteral + i = 2 + case n < 1<<16: + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 61<<2 | tagLiteral + i = 3 + case n < 1<<24: + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 62<<2 | tagLiteral + i = 4 + default: + dst[4] = uint8(n >> 24) + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 63<<2 | tagLiteral + i = 5 + } + return i + copy(dst[i:], lit) +} + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +// Length must be at least 4 and < 1<<24 +func emitRepeat(dst []byte, offset, length int) int { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + return 2 + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + return 2 + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + return 3 + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + return 5 + emitRepeat(dst[5:], offset, left) + } + return 5 +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopy(dst []byte, offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + // Emit a length 64 copy, encoded as 5 bytes. 
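+		// Layout: tag byte 63<<2|tagCopy4 (the maximum 64-byte length)
+		// followed by the offset as four little-endian bytes. For example,
+		// offset=70000, length=100 emits this 5-byte copy and then hands the
+		// remaining 36 bytes to emitRepeat.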
+ dst[4] = uint8(offset >> 24) + dst[3] = uint8(offset >> 16) + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 63<<2 | tagCopy4 + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitRepeat(dst[5:], offset, length) + } + i = 5 + } + if length == 0 { + return i + } + // Emit a copy, offset encoded as 4 bytes. + dst[i+0] = uint8(length-1)<<2 | tagCopy4 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + dst[i+3] = uint8(offset >> 16) + dst[i+4] = uint8(offset >> 24) + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } + // Emit remaining as repeats, at least 4 bytes remain. + return off + emitRepeat(dst[off:], offset, length) + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 +} + +// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopyNoRepeat(dst []byte, offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + // Emit a length 64 copy, encoded as 5 bytes. + dst[4] = uint8(offset >> 24) + dst[3] = uint8(offset >> 16) + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 63<<2 | tagCopy4 + length -= 64 + if length >= 4 { + // Emit remaining as repeats + return 5 + emitCopyNoRepeat(dst[5:], offset, length) + } + i = 5 + } + if length == 0 { + return i + } + // Emit a copy, offset encoded as 4 bytes. + dst[i+0] = uint8(length-1)<<2 | tagCopy4 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + dst[i+3] = uint8(offset >> 16) + dst[i+4] = uint8(offset >> 24) + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + emitCopyNoRepeat(dst[3:], offset, length) + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 +} + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// +// len(a) <= len(b) +func matchLen(a []byte, b []byte) int { + b = b[:len(a)] + var checked int + if len(a) > 4 { + // Try 4 bytes first + if diff := load32(a, 0) ^ load32(b, 0); diff != 0 { + return bits.TrailingZeros32(diff) >> 3 + } + // Switch to 8 byte matching. 
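+		// The first four bytes were compared with a single 32-bit load above;
+		// from here on 8-byte loads amortize the work, with checked tracking
+		// how many bytes are already known to be equal.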
+ checked = 4 + a = a[4:] + b = b[4:] + for len(a) >= 8 { + b = b[:len(a)] + if diff := load64(a, 0) ^ load64(b, 0); diff != 0 { + return checked + (bits.TrailingZeros64(diff) >> 3) + } + checked += 8 + a = a[8:] + b = b[8:] + } + } + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + return int(i) + checked + } + } + return len(a) + checked +} + +// input must be > inputMargin +func calcBlockSize(src []byte, _ *[32768]byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 13 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteralSize(src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeatSize(repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteralSize(src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. 
We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeatSize(repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteralSize(src[nextEmit:]) + } + return d +} + +// length must be > inputMargin. +func calcBlockSizeSmall(src []byte, _ *[2048]byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 9 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. 
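+			// checkRep offsets the probe by one byte: the comparison below
+			// matches the four bytes at s+1 against s+1-repeat (cv shifted
+			// right by 8 bits), and a successful match is first extended back
+			// toward nextEmit before its encoded size is counted.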
+ const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteralSize(src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeatSize(repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteralSize(src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeatSize(repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteralSize(src[nextEmit:]) + } + return d +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. 
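+// (This doc header is inherited from emitLiteral; the function below,
+// emitLiteralSize, only computes how many bytes such a chunk would occupy:
+// len(lit) plus a 1-5 byte header, e.g. 102 for a 100-byte literal.)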
+// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +func emitLiteralSize(lit []byte) int { + if len(lit) == 0 { + return 0 + } + switch { + case len(lit) <= 60: + return len(lit) + 1 + case len(lit) <= 1<<8: + return len(lit) + 2 + case len(lit) <= 1<<16: + return len(lit) + 3 + case len(lit) <= 1<<24: + return len(lit) + 4 + default: + return len(lit) + 5 + } +} + +func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4BlockAsm should be unreachable") +} + +func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4BlockSnappyAsm should be unreachable") +} + +func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4sBlockAsm should be unreachable") +} + +func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4sBlockSnappyAsm should be unreachable") +} diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go new file mode 100644 index 0000000000..f43aa81543 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -0,0 +1,228 @@ +// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !noasm + +package s2 + +func _dummy_() + +// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int + +// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm4MB(dst []byte, src []byte, tmp *[65536]byte) int + +// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int + +// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int + +// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int + +// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int + +// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. 
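+// The 589824-byte tmp area presumably packs both hash tables: 4*(1<<17)
+// bytes of long-match entries plus 4*(1<<14) bytes of short-match entries
+// (524288 + 65536 = 589824).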
+// +//go:noescape +func encodeBetterBlockAsm4MB(dst []byte, src []byte, tmp *[589824]byte) int + +// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int + +// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int + +// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int + +// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int + +// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65535 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm64K(dst []byte, src []byte, tmp *[65536]byte) int + +// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int + +// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int + +// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int + +// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int + +// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65535 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte, tmp *[294912]byte) int + +// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. 
+// +//go:noescape +func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int + +// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int + +// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int + +// calcBlockSize encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func calcBlockSize(src []byte, tmp *[32768]byte) int + +// calcBlockSizeSmall encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 1024 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func calcBlockSizeSmall(src []byte, tmp *[2048]byte) int + +// emitLiteral writes a literal chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes with margin of 0 bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +// +//go:noescape +func emitLiteral(dst []byte, lit []byte) int + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +// Length must be at least 4 and < 1<<32 +// +//go:noescape +func emitRepeat(dst []byte, offset int, length int) int + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +// +//go:noescape +func emitCopy(dst []byte, offset int, length int) int + +// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. 
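+// Unlike emitCopy it never falls back to repeat chunks, so its output stays
+// decodable by plain Snappy decoders.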
+// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +// +//go:noescape +func emitCopyNoRepeat(dst []byte, offset int, length int) int + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// +// len(a) <= len(b) +// +//go:noescape +func matchLen(a []byte, b []byte) int + +// cvtLZ4Block converts an LZ4 block to S2 +// +//go:noescape +func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) + +// cvtLZ4sBlock converts an LZ4s block to S2 +// +//go:noescape +func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) + +// cvtLZ4Block converts an LZ4 block to Snappy +// +//go:noescape +func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) + +// cvtLZ4sBlock converts an LZ4s block to Snappy +// +//go:noescape +func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s new file mode 100644 index 0000000000..df9be687be --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -0,0 +1,21303 @@ +// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !noasm + +#include "textflag.h" + +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#ifndef GOAMD64_v3 +#define GOAMD64_v3 +#endif +#endif + RET + +// func encodeBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000200, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm + LEAL 1(DX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm + +repeat_extend_back_loop_encodeBlockAsm: + CMPL DI, R8 + JBE repeat_extend_back_end_encodeBlockAsm + MOVB -1(BX)(SI*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm + +repeat_extend_back_end_encodeBlockAsm: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 5(CX)(SI*1), SI + 
CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBlockAsm: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBlockAsm + CMPL SI, $0x00010000 + JB three_bytes_repeat_emit_encodeBlockAsm + CMPL SI, $0x01000000 + JB four_bytes_repeat_emit_encodeBlockAsm + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x05, CX + JMP memmove_long_repeat_emit_encodeBlockAsm + +four_bytes_repeat_emit_encodeBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (CX) + MOVW SI, 1(CX) + MOVB R11, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_repeat_emit_encodeBlockAsm + +three_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeBlockAsm + +two_bytes_repeat_emit_encodeBlockAsm: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeBlockAsm + JMP memmove_long_repeat_emit_encodeBlockAsm + +one_byte_repeat_emit_encodeBlockAsm: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_repeat_emit_encodeBlockAsm: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm + +emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBlockAsm + +memmove_long_repeat_emit_encodeBlockAsm: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + 
ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBlockAsm: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL DX, R9 + LEAQ (BX)(DX*1), R10 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm: + CMPL R9, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm + MOVQ (R10)(R12*1), R11 + MOVQ 8(R10)(R12*1), R13 + XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm + XORQ 8(SI)(R12*1), R13 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm + LEAL -16(R9), R9 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm + +matchlen_bsf_16repeat_extend_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_match8_repeat_extend_encodeBlockAsm: + CMPL R9, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm + LEAL -8(R9), R9 + LEAL 8(R12), R12 + JMP matchlen_match4_repeat_extend_encodeBlockAsm + +matchlen_bsf_8_repeat_extend_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_match4_repeat_extend_encodeBlockAsm: + CMPL R9, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm + LEAL -4(R9), R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm: + CMPL R9, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm + JB repeat_extend_forward_end_encodeBlockAsm + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm + LEAL 2(R12), R12 + SUBL $0x02, R9 + JZ repeat_extend_forward_end_encodeBlockAsm + +matchlen_match1_repeat_extend_encodeBlockAsm: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm + LEAL 1(R12), R12 + +repeat_extend_forward_end_encodeBlockAsm: + ADDL R12, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm + + // emitRepeat +emit_repeat_again_match_repeat_encodeBlockAsm: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm + CMPL R8, $0x0c + JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm + CMPL DI, $0x00000800 + JB repeat_two_offset_match_repeat_encodeBlockAsm + +cant_repeat_two_offset_match_repeat_encodeBlockAsm: + CMPL SI, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm + CMPL SI, $0x00010100 + JB repeat_four_match_repeat_encodeBlockAsm + CMPL SI, $0x0100ffff + JB repeat_five_match_repeat_encodeBlockAsm + LEAL -16842747(SI), SI + MOVL $0xfffb001d, (CX) + MOVB $0xff, 4(CX) + ADDQ $0x05, CX + JMP emit_repeat_again_match_repeat_encodeBlockAsm + +repeat_five_match_repeat_encodeBlockAsm: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (CX) + MOVW SI, 2(CX) + SARL $0x10, DI + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_match_repeat_encodeBlockAsm: + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm 
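+// The repeat_five/four/three/two blocks mirror the tiers of the Go
+// emitRepeat: 5-, 4-, 3- and 2-byte encodings chosen by the remaining
+// length, with repeat_two_offset handling the short-length/small-offset
+// form that packs the offset into the tag.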
+ +repeat_three_match_repeat_encodeBlockAsm: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_match_repeat_encodeBlockAsm: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_match_repeat_encodeBlockAsm: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_as_copy_encodeBlockAsm: + // emitCopy + CMPL DI, $0x00010000 + JB two_byte_offset_repeat_as_copy_encodeBlockAsm + CMPL SI, $0x40 + JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm + MOVB $0xff, (CX) + MOVL DI, 1(CX) + LEAL -64(SI), SI + ADDQ $0x05, CX + CMPL SI, $0x04 + JB four_bytes_remain_repeat_as_copy_encodeBlockAsm + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL R8, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL SI, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy + CMPL SI, $0x0100ffff + JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy + LEAL -16842747(SI), SI + MOVL $0xfffb001d, (CX) + MOVB $0xff, 4(CX) + ADDQ $0x05, CX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (CX) + MOVW SI, 2(CX) + SARL $0x10, DI + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +four_bytes_remain_repeat_as_copy_encodeBlockAsm: + TESTL SI, SI + JZ repeat_end_emit_encodeBlockAsm + XORL R8, R8 + LEAL -1(R8)(SI*4), SI + MOVB SI, (CX) + MOVL DI, 1(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm + +two_byte_offset_repeat_as_copy_encodeBlockAsm: + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(CX) + MOVL DI, R9 + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL R8, $0x0c + JAE 
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x0100ffff + JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + LEAL -16842747(SI), SI + MOVL $0xfffb001d, (CX) + MOVB $0xff, 4(CX) + ADDQ $0x05, CX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (CX) + MOVW SI, 2(CX) + SARL $0x10, DI + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +long_offset_short_repeat_as_copy_encodeBlockAsm: + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL R8, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL SI, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short + CMPL SI, $0x0100ffff + JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short + LEAL -16842747(SI), SI + MOVL $0xfffb001d, (CX) + MOVB $0xff, 4(CX) + ADDQ $0x05, CX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (CX) + MOVW SI, 2(CX) + SARL $0x10, DI + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, 
SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm + CMPL DI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm + LEAL -15(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +emit_copy_three_repeat_as_copy_encodeBlockAsm: + LEAL -2(R8), R8 + MOVB R8, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + +repeat_end_emit_encodeBlockAsm: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm + +no_repeat_found_encodeBlockAsm: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm + +candidate3_match_encodeBlockAsm: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm + +candidate2_match_encodeBlockAsm: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm + +match_extend_back_loop_encodeBlockAsm: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm + JMP match_extend_back_loop_encodeBlockAsm + +match_extend_back_end_encodeBlockAsm: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 5(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBlockAsm: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JB one_byte_match_emit_encodeBlockAsm + CMPL R8, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm + CMPL R8, $0x00010000 + JB three_bytes_match_emit_encodeBlockAsm + CMPL R8, $0x01000000 + JB four_bytes_match_emit_encodeBlockAsm + MOVB $0xfc, (CX) + MOVL R8, 1(CX) + ADDQ $0x05, CX + JMP memmove_long_match_emit_encodeBlockAsm + +four_bytes_match_emit_encodeBlockAsm: + MOVL R8, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R8, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_match_emit_encodeBlockAsm + +three_bytes_match_emit_encodeBlockAsm: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeBlockAsm + +two_bytes_match_emit_encodeBlockAsm: + MOVB $0xf0, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_match_emit_encodeBlockAsm + JMP memmove_long_match_emit_encodeBlockAsm + +one_byte_match_emit_encodeBlockAsm: + SHLB $0x02, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeBlockAsm: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (CX) + JMP 
memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (CX) + MOVQ DI, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm: + MOVQ R8, CX + JMP emit_literal_done_match_emit_encodeBlockAsm + +memmove_long_match_emit_encodeBlockAsm: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_encodeBlockAsm: +match_nolit_loop_encodeBlockAsm: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_encodeBlockAsm: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm + +matchlen_bsf_16match_nolit_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_encodeBlockAsm + +matchlen_match8_match_nolit_encodeBlockAsm: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_encodeBlockAsm + +matchlen_bsf_8_match_nolit_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm + +matchlen_match4_match_nolit_encodeBlockAsm: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm + LEAL -4(DI), DI + LEAL 4(R10), R10 + 
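The matchlen_* blocks above implement the generated matchLen routine: compare 8 (or 16) bytes at a time via XOR, and on the first non-zero difference convert matching bits into matching bytes with a trailing-zero count (TZCNTQ when built with GOAMD64_v3, BSFQ otherwise, then SARQ $0x03). A minimal pure-Go sketch of the same strategy, for reference while reading the assembly; the name matchLen and the len(b) >= len(a) precondition are assumptions for this sketch, not the vendored API:

    package sketch

    import (
    	"encoding/binary"
    	"math/bits"
    )

    // matchLen returns the number of leading bytes shared by a and b.
    // It mirrors the matchlen_* assembly: XOR wide words, then count
    // the trailing zero bits of the first difference and divide by 8.
    // Assumes len(b) >= len(a).
    func matchLen(a, b []byte) int {
    	n := 0
    	for len(a) >= 8 {
    		diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
    		if diff != 0 {
    			// TZCNTQ/BSFQ + SARQ $0x03 in the generated code.
    			return n + bits.TrailingZeros64(diff)>>3
    		}
    		a, b = a[8:], b[8:]
    		n += 8
    	}
    	for i := range a {
    		if a[i] != b[i] {
    			return n + i
    		}
    	}
    	return n + len(a)
    }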
+matchlen_match2_match_nolit_encodeBlockAsm: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm + JB match_nolit_end_encodeBlockAsm + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_encodeBlockAsm + +matchlen_match1_match_nolit_encodeBlockAsm: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm + LEAL 1(R10), R10 + +match_nolit_end_encodeBlockAsm: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy + CMPL SI, $0x00010000 + JB two_byte_offset_match_nolit_encodeBlockAsm + CMPL R10, $0x40 + JBE four_bytes_remain_match_nolit_encodeBlockAsm + MOVB $0xff, (CX) + MOVL SI, 1(CX) + LEAL -64(R10), R10 + ADDQ $0x05, CX + CMPL R10, $0x04 + JB four_bytes_remain_match_nolit_encodeBlockAsm + + // emitRepeat +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm_emit_copy + CMPL R10, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm_emit_copy + CMPL R10, $0x0100ffff + JB repeat_five_match_nolit_encodeBlockAsm_emit_copy + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (CX) + MOVB $0xff, 4(CX) + ADDQ $0x05, CX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBlockAsm_emit_copy: + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (CX) + MOVW R10, 2(CX) + SARL $0x10, SI + MOVB SI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy: + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +four_bytes_remain_match_nolit_encodeBlockAsm: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeBlockAsm + XORL DI, DI + LEAL -1(DI)(R10*4), R10 + MOVB R10, (CX) + MOVL SI, 1(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_offset_match_nolit_encodeBlockAsm: + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBlockAsm + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(CX) + MOVL SI, R8 + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, DI + MOVB DI, (CX) + ADDQ $0x02, CX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE 
repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: + CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL R10, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL R10, $0x0100ffff + JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (CX) + MOVB $0xff, 4(CX) + ADDQ $0x05, CX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (CX) + MOVW R10, 2(CX) + SARL $0x10, SI + MOVB SI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +long_offset_short_match_nolit_encodeBlockAsm: + MOVB $0xee, (CX) + MOVW SI, 1(CX) + LEAL -60(R10), R10 + ADDQ $0x03, CX + + // emitRepeat +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short + CMPL R10, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short + CMPL R10, $0x0100ffff + JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (CX) + MOVB $0xff, 4(CX) + ADDQ $0x05, CX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (CX) + MOVW R10, 2(CX) + SARL $0x10, SI + MOVB SI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + 
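The repeat_two/repeat_three/repeat_four/repeat_five branches surrounding this point are the generated emitRepeat: a repeat opcode whose width grows with the remaining length. Reading the constants directly out of the assembly: a length of at most 8 takes a two-byte tag ((length-4)<<2 | 1), a length under 12 with an offset under 2048 is emitted as an ordinary two-byte copy instead, and tags 0x15/0x19/0x1d carry a 1-, 2- or 3-byte length remainder (cut-offs 0x104, 0x10100 and 0x100ffff on length-4), with anything larger emitting a maximal five-byte repeat and looping. A hedged Go sketch of that tiering, derived from the constants above rather than copied from the package (the name emitRepeatSketch is illustrative):

    package sketch

    // emitRepeatSketch appends a repeat(offset, length) encoding using the
    // same branch structure and byte layout as the assembly above. Not the
    // vendored API; the caller is assumed to guarantee length >= 4.
    func emitRepeatSketch(dst []byte, offset, length int) []byte {
    	for {
    		l := length - 4
    		switch {
    		case length <= 8: // repeat_two: length fits in the tag byte
    			return append(dst, byte(l)<<2|1, 0)
    		case length < 12 && offset < 2048: // repeat_two_offset: a plain copy is as cheap
    			return append(dst, byte(offset>>8)<<5|byte(l)<<2|1, byte(offset))
    		case l < 0x104: // repeat_three: one extra length byte
    			return append(dst, 0x15, 0, byte(l-4))
    		case l < 0x10100: // repeat_four: two extra length bytes
    			l -= 256
    			return append(dst, 0x19, 0, byte(l), byte(l>>8))
    		case l < 0x100ffff: // repeat_five: three extra length bytes
    			l -= 65536
    			return append(dst, 0x1d, 0, byte(l), byte(l>>8), byte(l>>16))
    		default: // maximal five-byte repeat, then loop on the remainder
    			dst = append(dst, 0x1d, 0, 0xfb, 0xff, 0xff)
    			length -= 16842751 // 0x0100ffff bytes consumed per iteration
    		}
    	}
    }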
+repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_offset_short_match_nolit_encodeBlockAsm: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_encodeBlockAsm + CMPL SI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBlockAsm + LEAL -15(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy_three_match_nolit_encodeBlockAsm: + LEAL -2(DI), DI + MOVB DI, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm + INCL DX + JMP search_loop_encodeBlockAsm + +emit_remainder_encodeBlockAsm: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 5(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBlockAsm: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBlockAsm + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeBlockAsm + CMPL DX, $0x01000000 + JB four_bytes_emit_remainder_encodeBlockAsm + MOVB $0xfc, (CX) + MOVL DX, 1(CX) + ADDQ $0x05, CX + JMP memmove_long_emit_remainder_encodeBlockAsm + +four_bytes_emit_remainder_encodeBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BL, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_emit_remainder_encodeBlockAsm + +three_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeBlockAsm + +two_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBlockAsm + JMP memmove_long_emit_remainder_encodeBlockAsm + +one_byte_emit_remainder_encodeBlockAsm: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsm: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + 
MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBlockAsm + +memmove_long_emit_remainder_encodeBlockAsm: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsm: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm4MB(dst []byte, src []byte, tmp *[65536]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm4MB(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000200, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm4MB: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm4MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm4MB: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm4MB + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ 
$0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm4MB + LEAL 1(DX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm4MB + +repeat_extend_back_loop_encodeBlockAsm4MB: + CMPL DI, R8 + JBE repeat_extend_back_end_encodeBlockAsm4MB + MOVB -1(BX)(SI*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm4MB + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm4MB + +repeat_extend_back_end_encodeBlockAsm4MB: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 4(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBlockAsm4MB + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBlockAsm4MB: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm4MB + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBlockAsm4MB + CMPL SI, $0x00010000 + JB three_bytes_repeat_emit_encodeBlockAsm4MB + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (CX) + MOVW SI, 1(CX) + MOVB R11, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +three_bytes_repeat_emit_encodeBlockAsm4MB: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +two_bytes_repeat_emit_encodeBlockAsm4MB: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeBlockAsm4MB + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +one_byte_repeat_emit_encodeBlockAsm4MB: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_repeat_emit_encodeBlockAsm4MB: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm4MB: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB + +memmove_long_repeat_emit_encodeBlockAsm4MB: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + 
MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBlockAsm4MB: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL DX, R9 + LEAQ (BX)(DX*1), R10 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm4MB + MOVQ (R10)(R12*1), R11 + MOVQ 8(R10)(R12*1), R13 + XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB + XORQ 8(SI)(R12*1), R13 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB + LEAL -16(R9), R9 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB + +matchlen_bsf_16repeat_extend_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_match8_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm4MB + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB + LEAL -8(R9), R9 + LEAL 8(R12), R12 + JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB + +matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_match4_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm4MB + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB + LEAL -4(R9), R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm4MB + JB repeat_extend_forward_end_encodeBlockAsm4MB + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB + LEAL 2(R12), R12 + SUBL $0x02, R9 + JZ repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_match1_repeat_extend_encodeBlockAsm4MB: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm4MB + LEAL 1(R12), R12 + +repeat_extend_forward_end_encodeBlockAsm4MB: + ADDL R12, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm4MB + CMPL R8, $0x0c + JAE 
cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB + CMPL DI, $0x00000800 + JB repeat_two_offset_match_repeat_encodeBlockAsm4MB + +cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: + CMPL SI, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm4MB + CMPL SI, $0x00010100 + JB repeat_four_match_repeat_encodeBlockAsm4MB + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (CX) + MOVW SI, 2(CX) + SARL $0x10, DI + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_match_repeat_encodeBlockAsm4MB: + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_match_repeat_encodeBlockAsm4MB: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_match_repeat_encodeBlockAsm4MB: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_match_repeat_encodeBlockAsm4MB: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_as_copy_encodeBlockAsm4MB: + // emitCopy + CMPL DI, $0x00010000 + JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB + CMPL SI, $0x40 + JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + MOVB $0xff, (CX) + MOVL DI, 1(CX) + LEAL -64(SI), SI + ADDQ $0x05, CX + CMPL SI, $0x04 + JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL R8, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (CX) + MOVW SI, 2(CX) + SARL $0x10, DI + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: + TESTL SI, SI + JZ repeat_end_emit_encodeBlockAsm4MB + XORL R8, R8 + LEAL -1(R8)(SI*4), SI + MOVB SI, (CX) + MOVL DI, 1(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, 
R8 + MOVB R8, (CX) + ADDQ $0x02, CX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL R8, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (CX) + MOVW SI, 2(CX) + SARL $0x10, DI + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +long_offset_short_repeat_as_copy_encodeBlockAsm4MB: + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL R8, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL SI, $0x00010100 + JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (CX) + MOVW SI, 2(CX) + SARL $0x10, DI + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB + CMPL DI, $0x00000800 + JAE 
emit_copy_three_repeat_as_copy_encodeBlockAsm4MB + LEAL -15(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4MB + +emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: + LEAL -2(R8), R8 + MOVB R8, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + +repeat_end_emit_encodeBlockAsm4MB: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm4MB + +no_repeat_found_encodeBlockAsm4MB: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm4MB + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm4MB + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm4MB + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm4MB + +candidate3_match_encodeBlockAsm4MB: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm4MB + +candidate2_match_encodeBlockAsm4MB: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm4MB: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm4MB + +match_extend_back_loop_encodeBlockAsm4MB: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm4MB + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm4MB + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm4MB + JMP match_extend_back_loop_encodeBlockAsm4MB + +match_extend_back_end_encodeBlockAsm4MB: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 4(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeBlockAsm4MB + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBlockAsm4MB: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm4MB + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JB one_byte_match_emit_encodeBlockAsm4MB + CMPL R8, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm4MB + CMPL R8, $0x00010000 + JB three_bytes_match_emit_encodeBlockAsm4MB + MOVL R8, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R8, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +three_bytes_match_emit_encodeBlockAsm4MB: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +two_bytes_match_emit_encodeBlockAsm4MB: + MOVB $0xf0, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_match_emit_encodeBlockAsm4MB + JMP memmove_long_match_emit_encodeBlockAsm4MB + +one_byte_match_emit_encodeBlockAsm4MB: + SHLB $0x02, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeBlockAsm4MB: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (CX) + MOVQ DI, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + 
MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB + +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm4MB: + MOVQ R8, CX + JMP emit_literal_done_match_emit_encodeBlockAsm4MB + +memmove_long_match_emit_encodeBlockAsm4MB: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_encodeBlockAsm4MB: +match_nolit_loop_encodeBlockAsm4MB: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm4MB + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB + +matchlen_bsf_16match_nolit_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_encodeBlockAsm4MB + +matchlen_match8_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm4MB + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_encodeBlockAsm4MB + +matchlen_bsf_8_match_nolit_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm4MB + +matchlen_match4_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm4MB + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm4MB + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm4MB + JB match_nolit_end_encodeBlockAsm4MB + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm4MB + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ 
match_nolit_end_encodeBlockAsm4MB + +matchlen_match1_match_nolit_encodeBlockAsm4MB: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm4MB + LEAL 1(R10), R10 + +match_nolit_end_encodeBlockAsm4MB: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy + CMPL SI, $0x00010000 + JB two_byte_offset_match_nolit_encodeBlockAsm4MB + CMPL R10, $0x40 + JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB + MOVB $0xff, (CX) + MOVL SI, 1(CX) + LEAL -64(R10), R10 + ADDQ $0x05, CX + CMPL R10, $0x04 + JB four_bytes_remain_match_nolit_encodeBlockAsm4MB + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL R10, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (CX) + MOVW R10, 2(CX) + SARL $0x10, SI + MOVB SI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBlockAsm4MB: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeBlockAsm4MB + XORL DI, DI + LEAL -1(DI)(R10*4), R10 + MOVB R10, (CX) + MOVL SI, 1(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +two_byte_offset_match_nolit_encodeBlockAsm4MB: + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm4MB + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL R10, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (CX) + MOVW R10, 2(CX) + SARL $0x10, SI + MOVB SI, 4(CX) + ADDQ $0x05, CX + JMP 
match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +long_offset_short_match_nolit_encodeBlockAsm4MB: + MOVB $0xee, (CX) + MOVW SI, 1(CX) + LEAL -60(R10), R10 + ADDQ $0x03, CX + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: + CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL R10, $0x00010100 + JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (CX) + MOVW R10, 2(CX) + SARL $0x10, SI + MOVB SI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +two_byte_offset_short_match_nolit_encodeBlockAsm4MB: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_encodeBlockAsm4MB + CMPL SI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBlockAsm4MB + LEAL -15(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +emit_copy_three_match_nolit_encodeBlockAsm4MB: + LEAL -2(DI), DI + MOVB DI, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm4MB: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm4MB + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm4MB + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm4MB: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), 
SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm4MB + INCL DX + JMP search_loop_encodeBlockAsm4MB + +emit_remainder_encodeBlockAsm4MB: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 4(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBlockAsm4MB + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBlockAsm4MB: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BL, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +three_bytes_emit_remainder_encodeBlockAsm4MB: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +two_bytes_emit_remainder_encodeBlockAsm4MB: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + +one_byte_emit_remainder_encodeBlockAsm4MB: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsm4MB: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm4MB: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB + +memmove_long_emit_remainder_encodeBlockAsm4MB: + LEAQ (CX)(SI*1), DX + 
MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsm4MB: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm12B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000080, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm12B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm12B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm12B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x18, R11 + IMULQ R9, R11 + SHRQ $0x34, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm12B + LEAL 1(DX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm12B + +repeat_extend_back_loop_encodeBlockAsm12B: + CMPL DI, R8 + JBE repeat_extend_back_end_encodeBlockAsm12B + MOVB -1(BX)(SI*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm12B + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm12B + +repeat_extend_back_end_encodeBlockAsm12B: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm12B + CMPL SI, $0x00000100 + JB 
two_bytes_repeat_emit_encodeBlockAsm12B + JB three_bytes_repeat_emit_encodeBlockAsm12B + +three_bytes_repeat_emit_encodeBlockAsm12B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeBlockAsm12B + +two_bytes_repeat_emit_encodeBlockAsm12B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeBlockAsm12B + JMP memmove_long_repeat_emit_encodeBlockAsm12B + +one_byte_repeat_emit_encodeBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_repeat_emit_encodeBlockAsm12B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm12B: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + +memmove_long_repeat_emit_encodeBlockAsm12B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBlockAsm12B: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL DX, R9 + LEAQ (BX)(DX*1), R10 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm12B + MOVQ (R10)(R12*1), R11 + MOVQ 8(R10)(R12*1), R13 + XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B + 
XORQ 8(SI)(R12*1), R13 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B + LEAL -16(R9), R9 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B + +matchlen_bsf_16repeat_extend_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_match8_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm12B + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B + LEAL -8(R9), R9 + LEAL 8(R12), R12 + JMP matchlen_match4_repeat_extend_encodeBlockAsm12B + +matchlen_bsf_8_repeat_extend_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_match4_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm12B + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm12B + LEAL -4(R9), R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm12B + JB repeat_extend_forward_end_encodeBlockAsm12B + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm12B + LEAL 2(R12), R12 + SUBL $0x02, R9 + JZ repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_match1_repeat_extend_encodeBlockAsm12B: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm12B + LEAL 1(R12), R12 + +repeat_extend_forward_end_encodeBlockAsm12B: + ADDL R12, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm12B + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm12B + CMPL R8, $0x0c + JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B + CMPL DI, $0x00000800 + JB repeat_two_offset_match_repeat_encodeBlockAsm12B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: + CMPL SI, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm12B + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_match_repeat_encodeBlockAsm12B: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_match_repeat_encodeBlockAsm12B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_match_repeat_encodeBlockAsm12B: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_as_copy_encodeBlockAsm12B: + // emitCopy + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + CMPL R8, $0x0c + JAE 
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm12B + +long_offset_short_repeat_as_copy_encodeBlockAsm12B: + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL R8, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + CMPL DI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + LEAL -15(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeBlockAsm12B: + LEAL -2(R8), R8 + MOVB R8, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + +repeat_end_emit_encodeBlockAsm12B: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm12B + +no_repeat_found_encodeBlockAsm12B: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm12B + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm12B + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm12B + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm12B + +candidate3_match_encodeBlockAsm12B: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm12B + +candidate2_match_encodeBlockAsm12B: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, 
SI + +candidate_match_encodeBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm12B + +match_extend_back_loop_encodeBlockAsm12B: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm12B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm12B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm12B + JMP match_extend_back_loop_encodeBlockAsm12B + +match_extend_back_end_encodeBlockAsm12B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBlockAsm12B: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JB one_byte_match_emit_encodeBlockAsm12B + CMPL R8, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm12B + JB three_bytes_match_emit_encodeBlockAsm12B + +three_bytes_match_emit_encodeBlockAsm12B: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeBlockAsm12B + +two_bytes_match_emit_encodeBlockAsm12B: + MOVB $0xf0, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_match_emit_encodeBlockAsm12B + JMP memmove_long_match_emit_encodeBlockAsm12B + +one_byte_match_emit_encodeBlockAsm12B: + SHLB $0x02, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeBlockAsm12B: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (CX) + MOVQ DI, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm12B + +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm12B: + MOVQ R8, CX + JMP emit_literal_done_match_emit_encodeBlockAsm12B + +memmove_long_match_emit_encodeBlockAsm12B: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA 
emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_encodeBlockAsm12B: +match_nolit_loop_encodeBlockAsm12B: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm12B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B + +matchlen_bsf_16match_nolit_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_encodeBlockAsm12B + +matchlen_match8_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm12B + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_encodeBlockAsm12B + +matchlen_bsf_8_match_nolit_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm12B + +matchlen_match4_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm12B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm12B + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm12B + JB match_nolit_end_encodeBlockAsm12B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm12B + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_encodeBlockAsm12B + +matchlen_match1_match_nolit_encodeBlockAsm12B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm12B + LEAL 1(R10), R10 + +match_nolit_end_encodeBlockAsm12B: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm12B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + 
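+ // emitRepeat: a copy reusing the previous offset. The tag bytes written
+ // below follow the S2 repeat encoding: tag 0x15 (5<<2|tagCopy1) starts a
+ // 3-byte repeat once the length (minus 4) is below 0x104, and tag 0x19
+ // (6<<2|tagCopy1) a 4-byte repeat with a 16-bit length field; each longer
+ // form stores its length relative to the shorter form's reach (the
+ // LEAL -4 / LEAL -256 adjustments around the stores).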
CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +long_offset_short_match_nolit_encodeBlockAsm12B: + MOVB $0xee, (CX) + MOVW SI, 1(CX) + LEAL -60(R10), R10 + ADDQ $0x03, CX + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBlockAsm12B: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_encodeBlockAsm12B + CMPL SI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBlockAsm12B + LEAL -15(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +emit_copy_three_match_nolit_encodeBlockAsm12B: + LEAL -2(DI), DI + MOVB DI, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm12B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm12B + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x18, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + SHLQ $0x18, SI + IMULQ R9, SI + SHRQ $0x34, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm12B + INCL DX + JMP search_loop_encodeBlockAsm12B + +emit_remainder_encodeBlockAsm12B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + 
RET + +emit_remainder_ok_encodeBlockAsm12B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBlockAsm12B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm12B + JB three_bytes_emit_remainder_encodeBlockAsm12B + +three_bytes_emit_remainder_encodeBlockAsm12B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeBlockAsm12B + +two_bytes_emit_remainder_encodeBlockAsm12B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBlockAsm12B + JMP memmove_long_emit_remainder_encodeBlockAsm12B + +one_byte_emit_remainder_encodeBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsm12B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm12B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + +memmove_long_emit_remainder_encodeBlockAsm12B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + 
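+ // genMemMoveLong, as in this loop: X0-X3 are preloaded with the first
+ // and last 32 bytes of the source, the destination is rounded up to
+ // 32-byte alignment, the loop copies 32 bytes per iteration (MOVOU
+ // loads, aligned MOVOA stores), and the preloaded registers are stored
+ // last so the unaligned head and tail are always covered regardless of
+ // how the loop rounds the length.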
ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsm12B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm10B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000020, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm10B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm10B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm10B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x36, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm10B + LEAL 1(DX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm10B + +repeat_extend_back_loop_encodeBlockAsm10B: + CMPL DI, R8 + JBE repeat_extend_back_end_encodeBlockAsm10B + MOVB -1(BX)(SI*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm10B + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm10B + +repeat_extend_back_end_encodeBlockAsm10B: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm10B + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBlockAsm10B + JB three_bytes_repeat_emit_encodeBlockAsm10B + +three_bytes_repeat_emit_encodeBlockAsm10B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeBlockAsm10B + +two_bytes_repeat_emit_encodeBlockAsm10B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeBlockAsm10B + JMP memmove_long_repeat_emit_encodeBlockAsm10B + +one_byte_repeat_emit_encodeBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + 
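+ // emitLiteral, as in the one_byte/two_bytes/three_bytes blocks above:
+ // runs up to 60 bytes pack (length-1)<<2 into the tag byte itself
+ // (SHLB $0x02); longer runs use tag 0xf0 (60<<2, one extra length byte)
+ // or 0xf4 (61<<2, two extra length bytes), the same layout Snappy uses.
+ // The literal bytes are then copied by the genMemMove blocks below.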
+memmove_repeat_emit_encodeBlockAsm10B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm10B: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBlockAsm10B + +memmove_long_repeat_emit_encodeBlockAsm10B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBlockAsm10B: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL DX, R9 + LEAQ (BX)(DX*1), R10 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm10B + MOVQ (R10)(R12*1), R11 + MOVQ 8(R10)(R12*1), R13 + XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B + XORQ 8(SI)(R12*1), R13 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B + LEAL -16(R9), R9 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B + +matchlen_bsf_16repeat_extend_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_match8_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm10B + MOVQ (R10)(R12*1), R11 
+ XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B + LEAL -8(R9), R9 + LEAL 8(R12), R12 + JMP matchlen_match4_repeat_extend_encodeBlockAsm10B + +matchlen_bsf_8_repeat_extend_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_match4_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm10B + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm10B + LEAL -4(R9), R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm10B + JB repeat_extend_forward_end_encodeBlockAsm10B + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm10B + LEAL 2(R12), R12 + SUBL $0x02, R9 + JZ repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_match1_repeat_extend_encodeBlockAsm10B: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm10B + LEAL 1(R12), R12 + +repeat_extend_forward_end_encodeBlockAsm10B: + ADDL R12, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm10B + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm10B + CMPL R8, $0x0c + JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B + CMPL DI, $0x00000800 + JB repeat_two_offset_match_repeat_encodeBlockAsm10B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: + CMPL SI, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm10B + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_match_repeat_encodeBlockAsm10B: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_match_repeat_encodeBlockAsm10B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_match_repeat_encodeBlockAsm10B: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_as_copy_encodeBlockAsm10B: + // emitCopy + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + CMPL R8, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + LEAL 
-4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm10B + +long_offset_short_repeat_as_copy_encodeBlockAsm10B: + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL R8, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL DI, $0x00000800 + JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + CMPL DI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + LEAL -15(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm10B + +emit_copy_three_repeat_as_copy_encodeBlockAsm10B: + LEAL -2(R8), R8 + MOVB R8, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + +repeat_end_emit_encodeBlockAsm10B: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm10B + +no_repeat_found_encodeBlockAsm10B: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm10B + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm10B + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm10B + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm10B + +candidate3_match_encodeBlockAsm10B: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm10B + +candidate2_match_encodeBlockAsm10B: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm10B + +match_extend_back_loop_encodeBlockAsm10B: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm10B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm10B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm10B + JMP match_extend_back_loop_encodeBlockAsm10B + +match_extend_back_end_encodeBlockAsm10B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 
3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBlockAsm10B: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JB one_byte_match_emit_encodeBlockAsm10B + CMPL R8, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm10B + JB three_bytes_match_emit_encodeBlockAsm10B + +three_bytes_match_emit_encodeBlockAsm10B: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeBlockAsm10B + +two_bytes_match_emit_encodeBlockAsm10B: + MOVB $0xf0, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_match_emit_encodeBlockAsm10B + JMP memmove_long_match_emit_encodeBlockAsm10B + +one_byte_match_emit_encodeBlockAsm10B: + SHLB $0x02, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeBlockAsm10B: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (CX) + MOVQ DI, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm10B + +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeBlockAsm10B: + MOVQ R8, CX + JMP emit_literal_done_match_emit_encodeBlockAsm10B + +memmove_long_match_emit_encodeBlockAsm10B: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_encodeBlockAsm10B: +match_nolit_loop_encodeBlockAsm10B: + 
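+ // The hash-table lookups in this function (search_loop above,
+ // match_nolit_dst_ok below) hash 4 bytes multiplicatively with the
+ // golden-ratio constant 0x9e3779b1, i.e. hash = ((u << 32) * prime) >>
+ // (64 - tableBits). The SHRQ amount fixes the table size per variant:
+ // $0x36 leaves 10 bits here (1024 entries, the *[4096]byte tmp), $0x38
+ // leaves 8 bits for Asm8B (*[1024]byte tmp), while Asm12B hashes 5 bytes
+ // (SHLQ $0x18) with 0xcf1bbcdcbb into 12 bits ($0x34).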
MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm10B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B + +matchlen_bsf_16match_nolit_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_encodeBlockAsm10B + +matchlen_match8_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm10B + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_encodeBlockAsm10B + +matchlen_bsf_8_match_nolit_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm10B + +matchlen_match4_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm10B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm10B + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm10B + JB match_nolit_end_encodeBlockAsm10B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm10B + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_encodeBlockAsm10B + +matchlen_match1_match_nolit_encodeBlockAsm10B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeBlockAsm10B + LEAL 1(R10), R10 + +match_nolit_end_encodeBlockAsm10B: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm10B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP 
match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +long_offset_short_match_nolit_encodeBlockAsm10B: + MOVB $0xee, (CX) + MOVW SI, 1(CX) + LEAL -60(R10), R10 + ADDQ $0x03, CX + + // emitRepeat + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL SI, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + CMPL R10, $0x00000104 + JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short + LEAL -256(R10), R10 + MOVW $0x0019, (CX) + MOVW R10, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (CX) + MOVB R10, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(CX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBlockAsm10B: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_encodeBlockAsm10B + CMPL SI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBlockAsm10B + LEAL -15(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +emit_copy_three_match_nolit_encodeBlockAsm10B: + LEAL -2(DI), DI + MOVB DI, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm10B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm10B + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm10B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x36, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_encodeBlockAsm10B + INCL DX + JMP search_loop_encodeBlockAsm10B + +emit_remainder_encodeBlockAsm10B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBlockAsm10B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBlockAsm10B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm10B + JB three_bytes_emit_remainder_encodeBlockAsm10B + +three_bytes_emit_remainder_encodeBlockAsm10B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP 
memmove_long_emit_remainder_encodeBlockAsm10B + +two_bytes_emit_remainder_encodeBlockAsm10B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBlockAsm10B + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +one_byte_emit_remainder_encodeBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBlockAsm10B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm10B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBlockAsm10B + +memmove_long_emit_remainder_encodeBlockAsm10B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + 
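+ // Epilogue next: the function returns the number of bytes written
+ // (output pointer CX minus dst_base). The earlier CMPQ ..., (SP) guards
+ // compare against a precomputed destination limit and return 0 instead,
+ // which the Go wrapper treats as a block that did not fit or compress.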
MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBlockAsm10B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBlockAsm8B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000008, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm8B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm8B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm8B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x38, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm8B + LEAL 1(DX), DI + MOVL 12(SP), R8 + MOVL DI, SI + SUBL 16(SP), SI + JZ repeat_extend_back_end_encodeBlockAsm8B + +repeat_extend_back_loop_encodeBlockAsm8B: + CMPL DI, R8 + JBE repeat_extend_back_end_encodeBlockAsm8B + MOVB -1(BX)(SI*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm8B + LEAL -1(DI), DI + DECL SI + JNZ repeat_extend_back_loop_encodeBlockAsm8B + +repeat_extend_back_end_encodeBlockAsm8B: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeBlockAsm8B + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBlockAsm8B + JB three_bytes_repeat_emit_encodeBlockAsm8B + +three_bytes_repeat_emit_encodeBlockAsm8B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +two_bytes_repeat_emit_encodeBlockAsm8B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeBlockAsm8B + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +one_byte_repeat_emit_encodeBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_repeat_emit_encodeBlockAsm8B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP 
memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm8B: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBlockAsm8B + +memmove_long_repeat_emit_encodeBlockAsm8B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBlockAsm8B: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R9 + SUBL DX, R9 + LEAQ (BX)(DX*1), R10 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm8B + MOVQ (R10)(R12*1), R11 + MOVQ 8(R10)(R12*1), R13 + XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B + XORQ 8(SI)(R12*1), R13 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B + LEAL -16(R9), R9 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B + +matchlen_bsf_16repeat_extend_encodeBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_match8_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm8B + MOVQ (R10)(R12*1), R11 + XORQ (SI)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B + LEAL -8(R9), R9 + LEAL 8(R12), R12 + JMP matchlen_match4_repeat_extend_encodeBlockAsm8B + +matchlen_bsf_8_repeat_extend_encodeBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_match4_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm8B + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE 
matchlen_match2_repeat_extend_encodeBlockAsm8B + LEAL -4(R9), R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm8B + JB repeat_extend_forward_end_encodeBlockAsm8B + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm8B + LEAL 2(R12), R12 + SUBL $0x02, R9 + JZ repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_match1_repeat_extend_encodeBlockAsm8B: + MOVB (R10)(R12*1), R11 + CMPB (SI)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBlockAsm8B + LEAL 1(R12), R12 + +repeat_extend_forward_end_encodeBlockAsm8B: + ADDL R12, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + TESTL R8, R8 + JZ repeat_as_copy_encodeBlockAsm8B + + // emitRepeat + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JBE repeat_two_match_repeat_encodeBlockAsm8B + CMPL DI, $0x0c + JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: + CMPL SI, $0x00000104 + JB repeat_three_match_repeat_encodeBlockAsm8B + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_match_repeat_encodeBlockAsm8B: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_match_repeat_encodeBlockAsm8B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_as_copy_encodeBlockAsm8B: + // emitCopy + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + CMPL SI, $0x00000104 + JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + LEAL -256(SI), SI + MOVW $0x0019, (CX) + MOVW SI, 2(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (CX) + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(CX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm8B + +long_offset_short_repeat_as_copy_encodeBlockAsm8B: + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + + // emitRepeat + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + CMPL DI, $0x0c + JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + 
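+ // Unlike the 10B/12B variants, Asm8B omits the CMPL DI, $0x00000800
+ // offset test before the two-byte repeat/copy forms: assuming this
+ // variant only sees blocks small enough that every offset fits the
+ // 11-bit short encoding, that branch could never be taken, so the
+ // generator leaves its target code unreachable after the JMPs above and
+ // drops the corresponding label.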
+cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+	CMPL SI, $0x00000104
+	JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
+	LEAL -256(SI), SI
+	MOVW $0x0019, (CX)
+	MOVW SI, 2(CX)
+	ADDQ $0x04, CX
+	JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+	LEAL -4(SI), SI
+	MOVW $0x0015, (CX)
+	MOVB SI, 2(CX)
+	ADDQ $0x03, CX
+	JMP repeat_end_emit_encodeBlockAsm8B
+
+repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
+	SHLL $0x02, SI
+	ORL $0x01, SI
+	MOVW SI, (CX)
+	ADDQ $0x02, CX
+	JMP repeat_end_emit_encodeBlockAsm8B
+	XORQ R8, R8
+	LEAL 1(R8)(SI*4), SI
+	MOVB DI, 1(CX)
+	SARL $0x08, DI
+	SHLL $0x05, DI
+	ORL DI, SI
+	MOVB SI, (CX)
+	ADDQ $0x02, CX
+	JMP repeat_end_emit_encodeBlockAsm8B
+
+two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
+	MOVL SI, R8
+	SHLL $0x02, R8
+	CMPL SI, $0x0c
+	JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
+	LEAL -15(R8), R8
+	MOVB DI, 1(CX)
+	SHRL $0x08, DI
+	SHLL $0x05, DI
+	ORL DI, R8
+	MOVB R8, (CX)
+	ADDQ $0x02, CX
+	JMP repeat_end_emit_encodeBlockAsm8B
+
+emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
+	LEAL -2(R8), R8
+	MOVB R8, (CX)
+	MOVW DI, 1(CX)
+	ADDQ $0x03, CX
+
+repeat_end_emit_encodeBlockAsm8B:
+	MOVL DX, 12(SP)
+	JMP search_loop_encodeBlockAsm8B
+
+no_repeat_found_encodeBlockAsm8B:
+	CMPL (BX)(SI*1), DI
+	JEQ candidate_match_encodeBlockAsm8B
+	SHRQ $0x08, DI
+	MOVL (AX)(R10*4), SI
+	LEAL 2(DX), R9
+	CMPL (BX)(R8*1), DI
+	JEQ candidate2_match_encodeBlockAsm8B
+	MOVL R9, (AX)(R10*4)
+	SHRQ $0x08, DI
+	CMPL (BX)(SI*1), DI
+	JEQ candidate3_match_encodeBlockAsm8B
+	MOVL 20(SP), DX
+	JMP search_loop_encodeBlockAsm8B
+
+candidate3_match_encodeBlockAsm8B:
+	ADDL $0x02, DX
+	JMP candidate_match_encodeBlockAsm8B
+
+candidate2_match_encodeBlockAsm8B:
+	MOVL R9, (AX)(R10*4)
+	INCL DX
+	MOVL R8, SI
+
+candidate_match_encodeBlockAsm8B:
+	MOVL 12(SP), DI
+	TESTL SI, SI
+	JZ match_extend_back_end_encodeBlockAsm8B
+
+match_extend_back_loop_encodeBlockAsm8B:
+	CMPL DX, DI
+	JBE match_extend_back_end_encodeBlockAsm8B
+	MOVB -1(BX)(SI*1), R8
+	MOVB -1(BX)(DX*1), R9
+	CMPB R8, R9
+	JNE match_extend_back_end_encodeBlockAsm8B
+	LEAL -1(DX), DX
+	DECL SI
+	JZ match_extend_back_end_encodeBlockAsm8B
+	JMP match_extend_back_loop_encodeBlockAsm8B
+
+match_extend_back_end_encodeBlockAsm8B:
+	MOVL DX, DI
+	SUBL 12(SP), DI
+	LEAQ 3(CX)(DI*1), DI
+	CMPQ DI, (SP)
+	JB match_dst_size_check_encodeBlockAsm8B
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+match_dst_size_check_encodeBlockAsm8B:
+	MOVL DX, DI
+	MOVL 12(SP), R8
+	CMPL R8, DI
+	JEQ emit_literal_done_match_emit_encodeBlockAsm8B
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (BX)(R8*1), DI
+	SUBL R8, R9
+	LEAL -1(R9), R8
+	CMPL R8, $0x3c
+	JB one_byte_match_emit_encodeBlockAsm8B
+	CMPL R8, $0x00000100
+	JB two_bytes_match_emit_encodeBlockAsm8B
+	JB three_bytes_match_emit_encodeBlockAsm8B
+
+three_bytes_match_emit_encodeBlockAsm8B:
+	MOVB $0xf4, (CX)
+	MOVW R8, 1(CX)
+	ADDQ $0x03, CX
+	JMP memmove_long_match_emit_encodeBlockAsm8B
+
+two_bytes_match_emit_encodeBlockAsm8B:
+	MOVB $0xf0, (CX)
+	MOVB R8, 1(CX)
+	ADDQ $0x02, CX
+	CMPL R8, $0x40
+	JB memmove_match_emit_encodeBlockAsm8B
+	JMP memmove_long_match_emit_encodeBlockAsm8B
+
+one_byte_match_emit_encodeBlockAsm8B:
+	SHLB $0x02, R8
+	MOVB R8, (CX)
+	ADDQ $0x01, CX
+
+memmove_match_emit_encodeBlockAsm8B:
+	LEAQ (CX)(R9*1), R8
+
+	// genMemMoveShort
+	CMPQ R9, $0x08
+	JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
+	CMPQ R9, $0x10
+	JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
+	JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
+	MOVQ (DI), R10
+	MOVQ R10, (CX)
+	JMP memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
+	MOVQ (DI), R10
+	MOVQ -8(DI)(R9*1), DI
+	MOVQ R10, (CX)
+	MOVQ DI, -8(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R9*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeBlockAsm8B
+
+emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm8B:
+	MOVQ R8, CX
+	JMP emit_literal_done_match_emit_encodeBlockAsm8B
+
+memmove_long_match_emit_encodeBlockAsm8B:
+	LEAQ (CX)(R9*1), R8
+
+	// genMemMoveLong
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R9*1), X2
+	MOVOU -16(DI)(R9*1), X3
+	MOVQ R9, R11
+	SHRQ $0x05, R11
+	MOVQ CX, R10
+	ANDL $0x0000001f, R10
+	MOVQ $0x00000040, R12
+	SUBQ R10, R12
+	DECQ R11
+	JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ -32(DI)(R12*1), R10
+	LEAQ -32(CX)(R12*1), R13
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ $0x20, R13
+	ADDQ $0x20, R10
+	ADDQ $0x20, R12
+	DECQ R11
+	JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R12*1), X4
+	MOVOU -16(DI)(R12*1), X5
+	MOVOA X4, -32(CX)(R12*1)
+	MOVOA X5, -16(CX)(R12*1)
+	ADDQ $0x20, R12
+	CMPQ R9, R12
+	JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+	MOVQ R8, CX
+
+emit_literal_done_match_emit_encodeBlockAsm8B:
+match_nolit_loop_encodeBlockAsm8B:
+	MOVL DX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	ADDL $0x04, DX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), DI
+	SUBL DX, DI
+	LEAQ (BX)(DX*1), R8
+	LEAQ (BX)(SI*1), SI
+
+	// matchLen
+	XORL R10, R10
+
+matchlen_loopback_16_match_nolit_encodeBlockAsm8B:
+	CMPL DI, $0x10
+	JB matchlen_match8_match_nolit_encodeBlockAsm8B
+	MOVQ (R8)(R10*1), R9
+	MOVQ 8(R8)(R10*1), R11
+	XORQ (SI)(R10*1), R9
+	JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
+	XORQ 8(SI)(R10*1), R11
+	JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B
+	LEAL -16(DI), DI
+	LEAL 16(R10), R10
+	JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B
+
+matchlen_bsf_16match_nolit_encodeBlockAsm8B:
+#ifdef GOAMD64_v3
+	TZCNTQ R11, R11
+
+#else
+	BSFQ R11, R11
+
+#endif
+	SARQ $0x03, R11
+	LEAL 8(R10)(R11*1), R10
+	JMP match_nolit_end_encodeBlockAsm8B
+
+matchlen_match8_match_nolit_encodeBlockAsm8B:
+	CMPL DI, $0x08
+	JB matchlen_match4_match_nolit_encodeBlockAsm8B
+	MOVQ (R8)(R10*1), R9
+	XORQ (SI)(R10*1), R9
+	JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
+	LEAL -8(DI), DI
+	LEAL 8(R10), R10
+	JMP matchlen_match4_match_nolit_encodeBlockAsm8B
+
+matchlen_bsf_8_match_nolit_encodeBlockAsm8B:
+#ifdef GOAMD64_v3
+	TZCNTQ R9, R9
+
+#else
+	BSFQ R9, R9
+
+#endif
+	SARQ $0x03, R9
+	LEAL (R10)(R9*1), R10
+	JMP match_nolit_end_encodeBlockAsm8B
+
+matchlen_match4_match_nolit_encodeBlockAsm8B:
+	CMPL DI, $0x04
+	JB matchlen_match2_match_nolit_encodeBlockAsm8B
+	MOVL (R8)(R10*1), R9
+	CMPL (SI)(R10*1), R9
+	JNE matchlen_match2_match_nolit_encodeBlockAsm8B
+	LEAL -4(DI), DI
+	LEAL 4(R10), R10
+
+matchlen_match2_match_nolit_encodeBlockAsm8B:
+	CMPL DI, $0x01
+	JE matchlen_match1_match_nolit_encodeBlockAsm8B
+	JB match_nolit_end_encodeBlockAsm8B
+	MOVW (R8)(R10*1), R9
+	CMPW (SI)(R10*1), R9
+	JNE matchlen_match1_match_nolit_encodeBlockAsm8B
+	LEAL 2(R10), R10
+	SUBL $0x02, DI
+	JZ match_nolit_end_encodeBlockAsm8B
+
+matchlen_match1_match_nolit_encodeBlockAsm8B:
+	MOVB (R8)(R10*1), R9
+	CMPB (SI)(R10*1), R9
+	JNE match_nolit_end_encodeBlockAsm8B
+	LEAL 1(R10), R10
+
+match_nolit_end_encodeBlockAsm8B:
+	ADDL R10, DX
+	MOVL 16(SP), SI
+	ADDL $0x04, R10
+	MOVL DX, 12(SP)
+
+	// emitCopy
+	CMPL R10, $0x40
+	JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B
+	CMPL SI, $0x00000800
+	JAE long_offset_short_match_nolit_encodeBlockAsm8B
+	MOVL $0x00000001, DI
+	LEAL 16(DI), DI
+	MOVB SI, 1(CX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL SI, DI
+	MOVB DI, (CX)
+	ADDQ $0x02, CX
+	SUBL $0x08, R10
+
+	// emitRepeat
+	LEAL -4(R10), R10
+	JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
+	MOVL R10, SI
+	LEAL -4(R10), R10
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
+	CMPL R10, $0x00000104
+	JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
+	LEAL -256(R10), R10
+	MOVW $0x0019, (CX)
+	MOVW R10, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
+	LEAL -4(R10), R10
+	MOVW $0x0015, (CX)
+	MOVB R10, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
+	SHLL $0x02, R10
+	ORL $0x01, R10
+	MOVW R10, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+	XORQ DI, DI
+	LEAL 1(DI)(R10*4), R10
+	MOVB SI, 1(CX)
+	SARL $0x08, SI
+	SHLL $0x05, SI
+	ORL SI, R10
+	MOVB R10, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+long_offset_short_match_nolit_encodeBlockAsm8B:
+	MOVB $0xee, (CX)
+	MOVW SI, 1(CX)
+	LEAL -60(R10), R10
+	ADDQ $0x03, CX
+
+	// emitRepeat
+	MOVL R10, SI
+	LEAL -4(R10), R10
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
+	CMPL R10, $0x00000104
+	JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
+	LEAL -256(R10), R10
+	MOVW $0x0019, (CX)
+	MOVW R10, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
+	LEAL -4(R10), R10
+	MOVW $0x0015, (CX)
+	MOVB R10, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
+	SHLL $0x02, R10
+	ORL $0x01, R10
+	MOVW R10, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+	XORQ DI, DI
+	LEAL 1(DI)(R10*4), R10
+	MOVB SI, 1(CX)
+	SARL $0x08, SI
+	SHLL $0x05, SI
+	ORL SI, R10
+	MOVB R10, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+two_byte_offset_short_match_nolit_encodeBlockAsm8B:
+	MOVL R10, DI
+	SHLL $0x02, DI
+	CMPL R10, $0x0c
+	JAE emit_copy_three_match_nolit_encodeBlockAsm8B
+	LEAL -15(DI), DI
+	MOVB SI, 1(CX)
+	SHRL $0x08, SI
+	SHLL $0x05, SI
+	ORL SI, DI
+	MOVB DI, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm8B
+
+emit_copy_three_match_nolit_encodeBlockAsm8B:
+	LEAL -2(DI), DI
+	MOVB DI, (CX)
+	MOVW SI, 1(CX)
+	ADDQ $0x03, CX
+
+match_nolit_emitcopy_end_encodeBlockAsm8B:
+	CMPL DX, 8(SP)
+	JAE emit_remainder_encodeBlockAsm8B
+	MOVQ -2(BX)(DX*1), DI
+	CMPQ CX, (SP)
+	JB match_nolit_dst_ok_encodeBlockAsm8B
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+match_nolit_dst_ok_encodeBlockAsm8B:
+	MOVQ $0x9e3779b1, R9
+	MOVQ DI, R8
+	SHRQ $0x10, DI
+	MOVQ DI, SI
+	SHLQ $0x20, R8
+	IMULQ R9, R8
+	SHRQ $0x38, R8
+	SHLQ $0x20, SI
+	IMULQ R9, SI
+	SHRQ $0x38, SI
+	LEAL -2(DX), R9
+	LEAQ (AX)(SI*4), R10
+	MOVL (R10), SI
+	MOVL R9, (AX)(R8*4)
+	MOVL DX, (R10)
+	CMPL (BX)(SI*1), DI
+	JEQ match_nolit_loop_encodeBlockAsm8B
+	INCL DX
+	JMP search_loop_encodeBlockAsm8B
+
+emit_remainder_encodeBlockAsm8B:
+	MOVQ src_len+32(FP), AX
+	SUBL 12(SP), AX
+	LEAQ 3(CX)(AX*1), AX
+	CMPQ AX, (SP)
+	JB emit_remainder_ok_encodeBlockAsm8B
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+emit_remainder_ok_encodeBlockAsm8B:
+	MOVQ src_len+32(FP), AX
+	MOVL 12(SP), DX
+	CMPL DX, AX
+	JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B
+	MOVL AX, SI
+	MOVL AX, 12(SP)
+	LEAQ (BX)(DX*1), AX
+	SUBL DX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JB one_byte_emit_remainder_encodeBlockAsm8B
+	CMPL DX, $0x00000100
+	JB two_bytes_emit_remainder_encodeBlockAsm8B
+	JB three_bytes_emit_remainder_encodeBlockAsm8B
+
+three_bytes_emit_remainder_encodeBlockAsm8B:
+	MOVB $0xf4, (CX)
+	MOVW DX, 1(CX)
+	ADDQ $0x03, CX
+	JMP memmove_long_emit_remainder_encodeBlockAsm8B
+
+two_bytes_emit_remainder_encodeBlockAsm8B:
+	MOVB $0xf0, (CX)
+	MOVB DL, 1(CX)
+	ADDQ $0x02, CX
+	CMPL DX, $0x40
+	JB memmove_emit_remainder_encodeBlockAsm8B
+	JMP memmove_long_emit_remainder_encodeBlockAsm8B
+
+one_byte_emit_remainder_encodeBlockAsm8B:
+	SHLB $0x02, DL
+	MOVB DL, (CX)
+	ADDQ $0x01, CX
+
+memmove_emit_remainder_encodeBlockAsm8B:
+	LEAQ (CX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x03
+	JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
+	JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
+	CMPQ BX, $0x08
+	JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
+	CMPQ BX, $0x10
+	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
+	JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
+	MOVB (AX), SI
+	MOVB -1(AX)(BX*1), AL
+	MOVB SI, (CX)
+	MOVB AL, -1(CX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
+	MOVW (AX), SI
+	MOVB 2(AX), AL
+	MOVW SI, (CX)
+	MOVB AL, 2(CX)
+	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
+	MOVL (AX), SI
+	MOVL -4(AX)(BX*1), AX
+	MOVL SI, (CX)
+	MOVL AX, -4(CX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
+	MOVQ (AX), SI
+	MOVQ -8(AX)(BX*1), AX
+	MOVQ SI, (CX)
+	MOVQ AX, -8(CX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
+	MOVOU (AX), X0
+	MOVOU -16(AX)(BX*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
+
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
+	MOVOU (AX), X0
+	MOVOU 16(AX), X1
+	MOVOU -32(AX)(BX*1), X2
+	MOVOU -16(AX)(BX*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(BX*1)
+	MOVOU X3, -16(CX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBlockAsm8B:
+	MOVQ DX, CX
+	JMP emit_literal_done_emit_remainder_encodeBlockAsm8B
+
+memmove_long_emit_remainder_encodeBlockAsm8B:
+	LEAQ (CX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (AX), X0
+	MOVOU 16(AX), X1
+	MOVOU -32(AX)(BX*1), X2
+	MOVOU -16(AX)(BX*1), X3
+	MOVQ BX, DI
+	SHRQ $0x05, DI
+	MOVQ CX, SI
+	ANDL $0x0000001f, SI
+	MOVQ $0x00000040, R8
+	SUBQ SI, R8
+	DECQ DI
+	JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
+	LEAQ -32(AX)(R8*1), SI
+	LEAQ -32(CX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ $0x20, R9
+	ADDQ $0x20, SI
+	ADDQ $0x20, R8
+	DECQ DI
+	JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
+	MOVOU -32(AX)(R8*1), X4
+	MOVOU -16(AX)(R8*1), X5
+	MOVOA X4, -32(CX)(R8*1)
+	MOVOA X5, -16(CX)(R8*1)
+	ADDQ $0x20, R8
+	CMPQ BX, R8
+	JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(BX*1)
+	MOVOU X3, -16(CX)(BX*1)
+	MOVQ DX, CX
+
+emit_literal_done_emit_remainder_encodeBlockAsm8B:
+	MOVQ dst_base+0(FP), AX
+	SUBQ AX, CX
+	MOVQ CX, ret+56(FP)
+	RET
+
+// func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int
+// Requires: BMI, SSE2
+TEXT ·encodeBetterBlockAsm(SB), $24-64
+	MOVQ tmp+48(FP), AX
+	MOVQ dst_base+0(FP), CX
+	MOVQ $0x00001200, DX
+	MOVQ AX, BX
+	PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm:
+	MOVOU X0, (BX)
+	MOVOU X0, 16(BX)
+	MOVOU X0, 32(BX)
+	MOVOU X0, 48(BX)
+	MOVOU X0, 64(BX)
+	MOVOU X0, 80(BX)
+	MOVOU X0, 96(BX)
+	MOVOU X0, 112(BX)
+	ADDQ $0x80, BX
+	DECQ DX
+	JNZ zero_loop_encodeBetterBlockAsm
+	MOVL $0x00000000, 12(SP)
+	MOVQ src_len+32(FP), DX
+	LEAQ -6(DX), BX
+	LEAQ -8(DX), SI
+	MOVL SI, 8(SP)
+	SHRQ $0x05, DX
+	SUBL DX, BX
+	LEAQ (CX)(BX*1), BX
+	MOVQ BX, (SP)
+	MOVL $0x00000001, DX
+	MOVL $0x00000000, 16(SP)
+	MOVQ src_base+24(FP), BX
+
+search_loop_encodeBetterBlockAsm:
+	MOVL DX, SI
+	SUBL 12(SP), SI
+	SHRL $0x07, SI
+	CMPL SI, $0x63
+	JBE check_maxskip_ok_encodeBetterBlockAsm
+	LEAL 100(DX), SI
+	JMP check_maxskip_cont_encodeBetterBlockAsm
+
+check_maxskip_ok_encodeBetterBlockAsm:
+	LEAL 1(DX)(SI*1), SI
+
+check_maxskip_cont_encodeBetterBlockAsm:
+	CMPL SI, 8(SP)
+	JAE emit_remainder_encodeBetterBlockAsm
+	MOVQ (BX)(DX*1), DI
+	MOVL SI, 20(SP)
+	MOVQ $0x00cf1bbcdcbfa563, R9
+	MOVQ $0x9e3779b1, SI
+	MOVQ DI, R10
+	MOVQ DI, R11
+	SHLQ $0x08, R10
+	IMULQ R9, R10
+	SHRQ $0x2f, R10
+	SHLQ $0x20, R11
+	IMULQ SI, R11
+	SHRQ $0x32, R11
+	MOVL (AX)(R10*4), SI
+	MOVL 524288(AX)(R11*4), R8
+	MOVL DX, (AX)(R10*4)
+	MOVL DX, 524288(AX)(R11*4)
+	MOVQ (BX)(SI*1), R10
+	MOVQ (BX)(R8*1), R11
+	CMPQ R10, DI
+	JEQ candidate_match_encodeBetterBlockAsm
+	CMPQ R11, DI
+	JNE no_short_found_encodeBetterBlockAsm
+	MOVL R8, SI
+	JMP candidate_match_encodeBetterBlockAsm
+
+no_short_found_encodeBetterBlockAsm:
+	CMPL R10, DI
+	JEQ candidate_match_encodeBetterBlockAsm
+	CMPL R11, DI
+	JEQ candidateS_match_encodeBetterBlockAsm
+	MOVL 20(SP), DX
+	JMP search_loop_encodeBetterBlockAsm
+
+candidateS_match_encodeBetterBlockAsm:
+	SHRQ $0x08, DI
+	MOVQ DI, R10
+	SHLQ $0x08, R10
+	IMULQ R9, R10
+	SHRQ $0x2f, R10
+	MOVL (AX)(R10*4), SI
+	INCL DX
+	MOVL DX, (AX)(R10*4)
+	CMPL (BX)(SI*1), DI
+	JEQ candidate_match_encodeBetterBlockAsm
+	DECL DX
+	MOVL R8, SI
+
+candidate_match_encodeBetterBlockAsm:
+	MOVL 12(SP), DI
+	TESTL SI, SI
+	JZ match_extend_back_end_encodeBetterBlockAsm
+
+match_extend_back_loop_encodeBetterBlockAsm:
+	CMPL DX, DI
+	JBE match_extend_back_end_encodeBetterBlockAsm
+	MOVB -1(BX)(SI*1), R8
+	MOVB -1(BX)(DX*1), R9
+	CMPB R8, R9
+	JNE match_extend_back_end_encodeBetterBlockAsm
+	LEAL -1(DX), DX
+	DECL SI
+	JZ match_extend_back_end_encodeBetterBlockAsm
+	JMP match_extend_back_loop_encodeBetterBlockAsm
+
+match_extend_back_end_encodeBetterBlockAsm:
+	MOVL DX, DI
+	SUBL 12(SP), DI
+	LEAQ 5(CX)(DI*1), DI
+	CMPQ DI, (SP)
+	JB match_dst_size_check_encodeBetterBlockAsm
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+match_dst_size_check_encodeBetterBlockAsm:
+	MOVL DX, DI
+	ADDL $0x04, DX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL DX, R8
+	LEAQ (BX)(DX*1), R9
+	LEAQ (BX)(SI*1), R10
+
+	// matchLen
+	XORL R12, R12
+
+matchlen_loopback_16_match_nolit_encodeBetterBlockAsm:
+	CMPL R8, $0x10
+	JB matchlen_match8_match_nolit_encodeBetterBlockAsm
+	MOVQ (R9)(R12*1), R11
+	MOVQ 8(R9)(R12*1), R13
+	XORQ (R10)(R12*1), R11
+	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
+	XORQ 8(R10)(R12*1), R13
+	JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm
+	LEAL -16(R8), R8
+	LEAL 16(R12), R12
+	JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm
+
+matchlen_bsf_16match_nolit_encodeBetterBlockAsm:
+#ifdef GOAMD64_v3
+	TZCNTQ R13, R13
+
+#else
+	BSFQ R13, R13
+
+#endif
+	SARQ $0x03, R13
+	LEAL 8(R12)(R13*1), R12
+	JMP match_nolit_end_encodeBetterBlockAsm
+
+matchlen_match8_match_nolit_encodeBetterBlockAsm:
+	CMPL R8, $0x08
+	JB matchlen_match4_match_nolit_encodeBetterBlockAsm
+	MOVQ (R9)(R12*1), R11
+	XORQ (R10)(R12*1), R11
+	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	JMP matchlen_match4_match_nolit_encodeBetterBlockAsm
+
+matchlen_bsf_8_match_nolit_encodeBetterBlockAsm:
+#ifdef GOAMD64_v3
+	TZCNTQ R11, R11
+
+#else
+	BSFQ R11, R11
+
+#endif
+	SARQ $0x03, R11
+	LEAL (R12)(R11*1), R12
+	JMP match_nolit_end_encodeBetterBlockAsm
+
+matchlen_match4_match_nolit_encodeBetterBlockAsm:
+	CMPL R8, $0x04
+	JB matchlen_match2_match_nolit_encodeBetterBlockAsm
+	MOVL (R9)(R12*1), R11
+	CMPL (R10)(R12*1), R11
+	JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
+	LEAL -4(R8), R8
+	LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeBetterBlockAsm:
+	CMPL R8, $0x01
+	JE matchlen_match1_match_nolit_encodeBetterBlockAsm
+	JB match_nolit_end_encodeBetterBlockAsm
+	MOVW (R9)(R12*1), R11
+	CMPW (R10)(R12*1), R11
+	JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
+	LEAL 2(R12), R12
+	SUBL $0x02, R8
+	JZ match_nolit_end_encodeBetterBlockAsm
+
+matchlen_match1_match_nolit_encodeBetterBlockAsm:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE match_nolit_end_encodeBetterBlockAsm
+	LEAL 1(R12), R12
+
+match_nolit_end_encodeBetterBlockAsm:
+	MOVL DX, R8
+	SUBL SI, R8
+
+	// Check if repeat
+	CMPL 16(SP), R8
+	JEQ match_is_repeat_encodeBetterBlockAsm
+	CMPL R12, $0x01
+	JA match_length_ok_encodeBetterBlockAsm
+	CMPL R8, $0x0000ffff
+	JBE match_length_ok_encodeBetterBlockAsm
+	MOVL 20(SP), DX
+	INCL DX
+	JMP search_loop_encodeBetterBlockAsm
+
+match_length_ok_encodeBetterBlockAsm:
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (BX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JB one_byte_match_emit_encodeBetterBlockAsm
+	CMPL SI, $0x00000100
+	JB two_bytes_match_emit_encodeBetterBlockAsm
+	CMPL SI, $0x00010000
+	JB three_bytes_match_emit_encodeBetterBlockAsm
+	CMPL SI, $0x01000000
+	JB four_bytes_match_emit_encodeBetterBlockAsm
+	MOVB $0xfc, (CX)
+	MOVL SI, 1(CX)
+	ADDQ $0x05, CX
+	JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+four_bytes_match_emit_encodeBetterBlockAsm:
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (CX)
+	MOVW SI, 1(CX)
+	MOVB R11, 3(CX)
+	ADDQ $0x04, CX
+	JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+three_bytes_match_emit_encodeBetterBlockAsm:
+	MOVB $0xf4, (CX)
+	MOVW SI, 1(CX)
+	ADDQ $0x03, CX
+	JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+two_bytes_match_emit_encodeBetterBlockAsm:
+	MOVB $0xf0, (CX)
+	MOVB SI, 1(CX)
+	ADDQ $0x02, CX
+	CMPL SI, $0x40
+	JB memmove_match_emit_encodeBetterBlockAsm
+	JMP memmove_long_match_emit_encodeBetterBlockAsm
+
+one_byte_match_emit_encodeBetterBlockAsm:
+	SHLB $0x02, SI
+	MOVB SI, (CX)
+	ADDQ $0x01, CX
+
+memmove_match_emit_encodeBetterBlockAsm:
+	LEAQ (CX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
+	CMPQ R9, $0x08
+	JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
+	JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (CX)
+	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (CX)
+	MOVL R10, -4(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (CX)
+	MOVQ R10, -8(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm:
+	MOVQ SI, CX
+	JMP emit_literal_done_match_emit_encodeBetterBlockAsm
+
+memmove_long_match_emit_encodeBetterBlockAsm:
+	LEAQ (CX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ R9, R13
+	SHRQ $0x05, R13
+	MOVQ CX, R11
+	ANDL $0x0000001f, R11
+	MOVQ $0x00000040, R14
+	SUBQ R11, R14
+	DECQ R13
+	JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	LEAQ -32(R10)(R14*1), R11
+	LEAQ -32(CX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ $0x20, R15
+	ADDQ $0x20, R11
+	ADDQ $0x20, R14
+	DECQ R13
+	JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(CX)(R14*1)
+	MOVOA X5, -16(CX)(R14*1)
+	ADDQ $0x20, R14
+	CMPQ R9, R14
+	JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+	MOVQ SI, CX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm:
+	ADDL R12, DX
+	ADDL $0x04, R12
+	MOVL DX, 12(SP)
+
+	// emitCopy
+	CMPL R8, $0x00010000
+	JB two_byte_offset_match_nolit_encodeBetterBlockAsm
+	CMPL R12, $0x40
+	JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm
+	MOVB $0xff, (CX)
+	MOVL R8, 1(CX)
+	LEAL -64(R12), R12
+	ADDQ $0x05, CX
+	CMPL R12, $0x04
+	JB four_bytes_remain_match_nolit_encodeBetterBlockAsm
+
+	// emitRepeat
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
+	CMPL R8, $0x00000800
+	JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
+	CMPL R12, $0x00000104
+	JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
+	CMPL R12, $0x00010100
+	JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
+	CMPL R12, $0x0100ffff
+	JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
+	LEAL -16842747(R12), R12
+	MOVL $0xfffb001d, (CX)
+	MOVB $0xff, 4(CX)
+	ADDQ $0x05, CX
+	JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (CX)
+	MOVW R12, 2(CX)
+	SARL $0x10, R8
+	MOVB R8, 4(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (CX)
+	MOVW R12, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (CX)
+	MOVB R12, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
+	SHLL $0x02, R12
+	ORL $0x01, R12
+	MOVW R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(CX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, R12
+	MOVB R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+four_bytes_remain_match_nolit_encodeBetterBlockAsm:
+	TESTL R12, R12
+	JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
+	XORL SI, SI
+	LEAL -1(SI)(R12*4), R12
+	MOVB R12, (CX)
+	MOVL R8, 1(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+two_byte_offset_match_nolit_encodeBetterBlockAsm:
+	CMPL R12, $0x40
+	JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
+	CMPL R8, $0x00000800
+	JAE long_offset_short_match_nolit_encodeBetterBlockAsm
+	MOVL $0x00000001, SI
+	LEAL 16(SI), SI
+	MOVB R8, 1(CX)
+	MOVL R8, R9
+	SHRL $0x08, R9
+	SHLL $0x05, R9
+	ORL R9, SI
+	MOVB SI, (CX)
+	ADDQ $0x02, CX
+	SUBL $0x08, R12
+
+	// emitRepeat
+	LEAL -4(R12), R12
+	JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+	CMPL R8, $0x00000800
+	JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+	CMPL R12, $0x00000104
+	JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+	CMPL R12, $0x00010100
+	JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+	CMPL R12, $0x0100ffff
+	JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+	LEAL -16842747(R12), R12
+	MOVL $0xfffb001d, (CX)
+	MOVB $0xff, 4(CX)
+	ADDQ $0x05, CX
+	JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (CX)
+	MOVW R12, 2(CX)
+	SARL $0x10, R8
+	MOVB R8, 4(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (CX)
+	MOVW R12, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (CX)
+	MOVB R12, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+	SHLL $0x02, R12
+	ORL $0x01, R12
+	MOVW R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(CX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, R12
+	MOVB R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+long_offset_short_match_nolit_encodeBetterBlockAsm:
+	MOVB $0xee, (CX)
+	MOVW R8, 1(CX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, CX
+
+	// emitRepeat
+emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	CMPL R8, $0x00000800
+	JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	CMPL R12, $0x00000104
+	JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	CMPL R12, $0x00010100
+	JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	CMPL R12, $0x0100ffff
+	JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
+	LEAL -16842747(R12), R12
+	MOVL $0xfffb001d, (CX)
+	MOVB $0xff, 4(CX)
+	ADDQ $0x05, CX
+	JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
+
+repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (CX)
+	MOVW R12, 2(CX)
+	SARL $0x10, R8
+	MOVB R8, 4(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (CX)
+	MOVW R12, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (CX)
+	MOVB R12, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	SHLL $0x02, R12
+	ORL $0x01, R12
+	MOVW R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(CX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, R12
+	MOVB R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
+	MOVL R12, SI
+	SHLL $0x02, SI
+	CMPL R12, $0x0c
+	JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
+	CMPL R8, $0x00000800
+	JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
+	LEAL -15(SI), SI
+	MOVB R8, 1(CX)
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, SI
+	MOVB SI, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm:
+	LEAL -2(SI), SI
+	MOVB SI, (CX)
+	MOVW R8, 1(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+match_is_repeat_encodeBetterBlockAsm:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (BX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JB one_byte_match_emit_repeat_encodeBetterBlockAsm
+	CMPL SI, $0x00000100
+	JB two_bytes_match_emit_repeat_encodeBetterBlockAsm
+	CMPL SI, $0x00010000
+	JB three_bytes_match_emit_repeat_encodeBetterBlockAsm
+	CMPL SI, $0x01000000
+	JB four_bytes_match_emit_repeat_encodeBetterBlockAsm
+	MOVB $0xfc, (CX)
+	MOVL SI, 1(CX)
+	ADDQ $0x05, CX
+	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+four_bytes_match_emit_repeat_encodeBetterBlockAsm:
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (CX)
+	MOVW SI, 1(CX)
+	MOVB R11, 3(CX)
+	ADDQ $0x04, CX
+	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+three_bytes_match_emit_repeat_encodeBetterBlockAsm:
+	MOVB $0xf4, (CX)
+	MOVW SI, 1(CX)
+	ADDQ $0x03, CX
+	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm:
+	MOVB $0xf0, (CX)
+	MOVB SI, 1(CX)
+	ADDQ $0x02, CX
+	CMPL SI, $0x40
+	JB memmove_match_emit_repeat_encodeBetterBlockAsm
+	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm:
+	SHLB $0x02, SI
+	MOVB SI, (CX)
+	ADDQ $0x01, CX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm:
+	LEAQ (CX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
+	CMPQ R9, $0x08
+	JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
+	JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (CX)
+	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (CX)
+	MOVL R10, -4(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (CX)
+	MOVQ R10, -8(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
+	MOVQ SI, CX
+	JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm:
+	LEAQ (CX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ R9, R13
+	SHRQ $0x05, R13
+	MOVQ CX, R11
+	ANDL $0x0000001f, R11
+	MOVQ $0x00000040, R14
+	SUBQ R11, R14
+	DECQ R13
+	JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	LEAQ -32(R10)(R14*1), R11
+	LEAQ -32(CX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ $0x20, R15
+	ADDQ $0x20, R11
+	ADDQ $0x20, R14
+	DECQ R13
+	JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(CX)(R14*1)
+	MOVOA X5, -16(CX)(R14*1)
+	ADDQ $0x20, R14
+	CMPQ R9, R14
+	JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+	MOVQ SI, CX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
+	ADDL R12, DX
+	ADDL $0x04, R12
+	MOVL DX, 12(SP)
+
+	// emitRepeat
+emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
+	CMPL R8, $0x00000800
+	JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
+	CMPL R12, $0x00000104
+	JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm
+	CMPL R12, $0x00010100
+	JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm
+	CMPL R12, $0x0100ffff
+	JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm
+	LEAL -16842747(R12), R12
+	MOVL $0xfffb001d, (CX)
+	MOVB $0xff, 4(CX)
+	ADDQ $0x05, CX
+	JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
+
+repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (CX)
+	MOVW R12, 2(CX)
+	SARL $0x10, R8
+	MOVB R8, 4(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (CX)
+	MOVW R12, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (CX)
+	MOVB R12, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
+	SHLL $0x02, R12
+	ORL $0x01, R12
+	MOVW R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(CX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, R12
+	MOVB R12, (CX)
+	ADDQ $0x02, CX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm:
+	CMPL DX, 8(SP)
+	JAE emit_remainder_encodeBetterBlockAsm
+	CMPQ CX, (SP)
+	JB match_nolit_dst_ok_encodeBetterBlockAsm
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm:
+	MOVQ $0x00cf1bbcdcbfa563, SI
+	MOVQ $0x9e3779b1, R8
+	LEAQ 1(DI), DI
+	LEAQ -2(DX), R9
+	MOVQ (BX)(DI*1), R10
+	MOVQ 1(BX)(DI*1), R11
+	MOVQ (BX)(R9*1), R12
+	MOVQ 1(BX)(R9*1), R13
+	SHLQ $0x08, R10
+	IMULQ SI, R10
+	SHRQ $0x2f, R10
+	SHLQ $0x20, R11
+	IMULQ R8, R11
+	SHRQ $0x32, R11
+	SHLQ $0x08, R12
+	IMULQ SI, R12
+	SHRQ $0x2f, R12
+	SHLQ $0x20, R13
+	IMULQ R8, R13
+	SHRQ $0x32, R13
+	LEAQ 1(DI), R8
+	LEAQ 1(R9), R14
+	MOVL DI, (AX)(R10*4)
+	MOVL R9, (AX)(R12*4)
+	MOVL R8, 524288(AX)(R11*4)
+	MOVL R14, 524288(AX)(R13*4)
+	LEAQ 1(R9)(DI*1), R8
+	SHRQ $0x01, R8
+	ADDQ $0x01, DI
+	SUBQ $0x01, R9
+
+index_loop_encodeBetterBlockAsm:
+	CMPQ R8, R9
+	JAE search_loop_encodeBetterBlockAsm
+	MOVQ (BX)(DI*1), R10
+	MOVQ (BX)(R8*1), R11
+	SHLQ $0x08, R10
+	IMULQ SI, R10
+	SHRQ $0x2f, R10
+	SHLQ $0x08, R11
+	IMULQ SI, R11
+	SHRQ $0x2f, R11
+	MOVL DI, (AX)(R10*4)
+	MOVL R8, (AX)(R11*4)
+	ADDQ $0x02, DI
+	ADDQ $0x02, R8
+	JMP index_loop_encodeBetterBlockAsm
+
+emit_remainder_encodeBetterBlockAsm:
+	MOVQ src_len+32(FP), AX
+	SUBL 12(SP), AX
+	LEAQ 5(CX)(AX*1), AX
+	CMPQ AX, (SP)
+	JB emit_remainder_ok_encodeBetterBlockAsm
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+emit_remainder_ok_encodeBetterBlockAsm:
+	MOVQ src_len+32(FP), AX
+	MOVL 12(SP), DX
+	CMPL DX, AX
+	JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm
+	MOVL AX, SI
+	MOVL AX, 12(SP)
+	LEAQ (BX)(DX*1), AX
+	SUBL DX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JB one_byte_emit_remainder_encodeBetterBlockAsm
+	CMPL DX, $0x00000100
+	JB two_bytes_emit_remainder_encodeBetterBlockAsm
+	CMPL DX, $0x00010000
+	JB three_bytes_emit_remainder_encodeBetterBlockAsm
+	CMPL DX, $0x01000000
+	JB four_bytes_emit_remainder_encodeBetterBlockAsm
+	MOVB $0xfc, (CX)
+	MOVL DX, 1(CX)
+	ADDQ $0x05, CX
+	JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+four_bytes_emit_remainder_encodeBetterBlockAsm:
+	MOVL DX, BX
+	SHRL $0x10, BX
+	MOVB $0xf8, (CX)
+	MOVW DX, 1(CX)
+	MOVB BL, 3(CX)
+	ADDQ $0x04, CX
+	JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+three_bytes_emit_remainder_encodeBetterBlockAsm:
+	MOVB $0xf4, (CX)
+	MOVW DX, 1(CX)
+	ADDQ $0x03, CX
+	JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+two_bytes_emit_remainder_encodeBetterBlockAsm:
+	MOVB $0xf0, (CX)
+	MOVB DL, 1(CX)
+	ADDQ $0x02, CX
+	CMPL DX, $0x40
+	JB memmove_emit_remainder_encodeBetterBlockAsm
+	JMP memmove_long_emit_remainder_encodeBetterBlockAsm
+
+one_byte_emit_remainder_encodeBetterBlockAsm:
+	SHLB $0x02, DL
+	MOVB DL, (CX)
+	ADDQ $0x01, CX
+
+memmove_emit_remainder_encodeBetterBlockAsm:
+	LEAQ (CX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveShort
+	CMPQ BX, $0x03
+	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
+	JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
+	CMPQ BX, $0x08
+	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
+	CMPQ BX, $0x10
+	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
+	CMPQ BX, $0x20
+	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
+	JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
+	MOVB (AX), SI
+	MOVB -1(AX)(BX*1), AL
+	MOVB SI, (CX)
+	MOVB AL, -1(CX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
+	MOVW (AX), SI
+	MOVB 2(AX), AL
+	MOVW SI, (CX)
+	MOVB AL, 2(CX)
+	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
+	MOVL (AX), SI
+	MOVL -4(AX)(BX*1), AX
+	MOVL SI, (CX)
+	MOVL AX, -4(CX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
+	MOVQ (AX), SI
+	MOVQ -8(AX)(BX*1), AX
+	MOVQ SI, (CX)
+	MOVQ AX, -8(CX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
+	MOVOU (AX), X0
+	MOVOU -16(AX)(BX*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(BX*1)
+	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
+
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
+	MOVOU (AX), X0
+	MOVOU 16(AX), X1
+	MOVOU -32(AX)(BX*1), X2
+	MOVOU -16(AX)(BX*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(BX*1)
+	MOVOU X3, -16(CX)(BX*1)
+
+memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
+	MOVQ DX, CX
+	JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
+
+memmove_long_emit_remainder_encodeBetterBlockAsm:
+	LEAQ (CX)(SI*1), DX
+	MOVL SI, BX
+
+	// genMemMoveLong
+	MOVOU (AX), X0
+	MOVOU 16(AX), X1
+	MOVOU -32(AX)(BX*1), X2
+	MOVOU -16(AX)(BX*1), X3
+	MOVQ BX, DI
+	SHRQ $0x05, DI
+	MOVQ CX, SI
+	ANDL $0x0000001f, SI
+	MOVQ $0x00000040, R8
+	SUBQ SI, R8
+	DECQ DI
+	JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	LEAQ -32(AX)(R8*1), SI
+	LEAQ -32(CX)(R8*1), R9
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
+	MOVOU (SI), X4
+	MOVOU 16(SI), X5
+	MOVOA X4, (R9)
+	MOVOA X5, 16(R9)
+	ADDQ $0x20, R9
+	ADDQ $0x20, SI
+	ADDQ $0x20, R8
+	DECQ DI
+	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
+
+emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
+	MOVOU -32(AX)(R8*1), X4
+	MOVOU -16(AX)(R8*1), X5
+	MOVOA X4, -32(CX)(R8*1)
+	MOVOA X5, -16(CX)(R8*1)
+	ADDQ $0x20, R8
+	CMPQ BX, R8
+	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(BX*1)
+	MOVOU X3, -16(CX)(BX*1)
+	MOVQ DX, CX
+
+emit_literal_done_emit_remainder_encodeBetterBlockAsm:
+	MOVQ dst_base+0(FP), AX
+	SUBQ AX, CX
+	MOVQ CX, ret+56(FP)
+	RET
+
+// func encodeBetterBlockAsm4MB(dst []byte, src []byte, tmp *[589824]byte) int
+// Requires: BMI, SSE2
+TEXT ·encodeBetterBlockAsm4MB(SB), $24-64
+	MOVQ tmp+48(FP), AX
+	MOVQ dst_base+0(FP), CX
+	MOVQ $0x00001200, DX
+	MOVQ AX, BX
+	PXOR X0, X0
+
+zero_loop_encodeBetterBlockAsm4MB:
+	MOVOU X0, (BX)
+	MOVOU X0, 16(BX)
+	MOVOU X0, 32(BX)
+	MOVOU X0, 48(BX)
+	MOVOU X0, 64(BX)
+	MOVOU X0, 80(BX)
+	MOVOU X0, 96(BX)
+	MOVOU X0, 112(BX)
+	ADDQ $0x80, BX
+	DECQ DX
+	JNZ zero_loop_encodeBetterBlockAsm4MB
+	MOVL $0x00000000, 12(SP)
+	MOVQ src_len+32(FP), DX
+	LEAQ -6(DX), BX
+	LEAQ -8(DX), SI
+	MOVL SI, 8(SP)
+	SHRQ $0x05, DX
+	SUBL DX, BX
+	LEAQ (CX)(BX*1), BX
+	MOVQ BX, (SP)
+	MOVL $0x00000001, DX
+	MOVL $0x00000000, 16(SP)
+	MOVQ src_base+24(FP), BX
+
+search_loop_encodeBetterBlockAsm4MB:
+	MOVL DX, SI
+	SUBL 12(SP), SI
+	SHRL $0x07, SI
+	CMPL SI, $0x63
+	JBE check_maxskip_ok_encodeBetterBlockAsm4MB
+	LEAL 100(DX), SI
+	JMP check_maxskip_cont_encodeBetterBlockAsm4MB
+
+check_maxskip_ok_encodeBetterBlockAsm4MB:
+	LEAL 1(DX)(SI*1), SI
+
+check_maxskip_cont_encodeBetterBlockAsm4MB:
+	CMPL SI, 8(SP)
+	JAE emit_remainder_encodeBetterBlockAsm4MB
+	MOVQ (BX)(DX*1), DI
+	MOVL SI, 20(SP)
+	MOVQ $0x00cf1bbcdcbfa563, R9
+	MOVQ $0x9e3779b1, SI
+	MOVQ DI, R10
+	MOVQ DI, R11
+	SHLQ $0x08, R10
+	IMULQ R9, R10
+	SHRQ $0x2f, R10
+	SHLQ $0x20, R11
+	IMULQ SI, R11
+	SHRQ $0x32, R11
+	MOVL (AX)(R10*4), SI
+	MOVL 524288(AX)(R11*4), R8
+	MOVL DX, (AX)(R10*4)
+	MOVL DX, 524288(AX)(R11*4)
+	MOVQ (BX)(SI*1), R10
+	MOVQ (BX)(R8*1), R11
+	CMPQ R10, DI
+	JEQ candidate_match_encodeBetterBlockAsm4MB
+	CMPQ R11, DI
+	JNE no_short_found_encodeBetterBlockAsm4MB
+	MOVL R8, SI
+	JMP candidate_match_encodeBetterBlockAsm4MB
+
+no_short_found_encodeBetterBlockAsm4MB:
+	CMPL R10, DI
+	JEQ candidate_match_encodeBetterBlockAsm4MB
+	CMPL R11, DI
+	JEQ candidateS_match_encodeBetterBlockAsm4MB
+	MOVL 20(SP), DX
+	JMP search_loop_encodeBetterBlockAsm4MB
+
+candidateS_match_encodeBetterBlockAsm4MB:
+	SHRQ $0x08, DI
+	MOVQ DI, R10
+	SHLQ $0x08, R10
+	IMULQ R9, R10
+	SHRQ $0x2f, R10
+	MOVL (AX)(R10*4), SI
+	INCL DX
+	MOVL DX, (AX)(R10*4)
+	CMPL (BX)(SI*1), DI
+	JEQ candidate_match_encodeBetterBlockAsm4MB
+	DECL DX
+	MOVL R8, SI
+
+candidate_match_encodeBetterBlockAsm4MB:
+	MOVL 12(SP), DI
+	TESTL SI, SI
+	JZ match_extend_back_end_encodeBetterBlockAsm4MB
+
+match_extend_back_loop_encodeBetterBlockAsm4MB:
+	CMPL DX, DI
+	JBE match_extend_back_end_encodeBetterBlockAsm4MB
+	MOVB -1(BX)(SI*1), R8
+	MOVB -1(BX)(DX*1), R9
+	CMPB R8, R9
+	JNE match_extend_back_end_encodeBetterBlockAsm4MB
+	LEAL -1(DX), DX
+	DECL SI
+	JZ match_extend_back_end_encodeBetterBlockAsm4MB
+	JMP match_extend_back_loop_encodeBetterBlockAsm4MB
+
+match_extend_back_end_encodeBetterBlockAsm4MB:
+	MOVL DX, DI
+	SUBL 12(SP), DI
+	LEAQ 4(CX)(DI*1), DI
+	CMPQ DI, (SP)
+	JB match_dst_size_check_encodeBetterBlockAsm4MB
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+match_dst_size_check_encodeBetterBlockAsm4MB:
+	MOVL DX, DI
+	ADDL $0x04, DX
+	ADDL $0x04, SI
+	MOVQ src_len+32(FP), R8
+	SUBL DX, R8
+	LEAQ (BX)(DX*1), R9
+	LEAQ (BX)(SI*1), R10
+
+	// matchLen
+	XORL R12, R12
+
+matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB:
+	CMPL R8, $0x10
+	JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB
+	MOVQ (R9)(R12*1), R11
+	MOVQ 8(R9)(R12*1), R13
+	XORQ (R10)(R12*1), R11
+	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
+	XORQ 8(R10)(R12*1), R13
+	JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB
+	LEAL -16(R8), R8
+	LEAL 16(R12), R12
+	JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB
+
+matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB:
+#ifdef GOAMD64_v3
+	TZCNTQ R13, R13
+
+#else
+	BSFQ R13, R13
+
+#endif
+	SARQ $0x03, R13
+	LEAL 8(R12)(R13*1), R12
+	JMP match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_match8_match_nolit_encodeBetterBlockAsm4MB:
+	CMPL R8, $0x08
+	JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
+	MOVQ (R9)(R12*1), R11
+	XORQ (R10)(R12*1), R11
+	JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
+	LEAL -8(R8), R8
+	LEAL 8(R12), R12
+	JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
+
+matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB:
+#ifdef GOAMD64_v3
+	TZCNTQ R11, R11
+
+#else
+	BSFQ R11, R11
+
+#endif
+	SARQ $0x03, R11
+	LEAL (R12)(R11*1), R12
+	JMP match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
+	CMPL R8, $0x04
+	JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
+	MOVL (R9)(R12*1), R11
+	CMPL (R10)(R12*1), R11
+	JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
+	LEAL -4(R8), R8
+	LEAL 4(R12), R12
+
+matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
+	CMPL R8, $0x01
+	JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
+	JB match_nolit_end_encodeBetterBlockAsm4MB
+	MOVW (R9)(R12*1), R11
+	CMPW (R10)(R12*1), R11
+	JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
+	LEAL 2(R12), R12
+	SUBL $0x02, R8
+	JZ match_nolit_end_encodeBetterBlockAsm4MB
+
+matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
+	MOVB (R9)(R12*1), R11
+	CMPB (R10)(R12*1), R11
+	JNE match_nolit_end_encodeBetterBlockAsm4MB
+	LEAL 1(R12), R12
+
+match_nolit_end_encodeBetterBlockAsm4MB:
+	MOVL DX, R8
+	SUBL SI, R8
+
+	// Check if repeat
+	CMPL 16(SP), R8
+	JEQ match_is_repeat_encodeBetterBlockAsm4MB
+	CMPL R12, $0x01
+	JA match_length_ok_encodeBetterBlockAsm4MB
+	CMPL R8, $0x0000ffff
+	JBE match_length_ok_encodeBetterBlockAsm4MB
+	MOVL 20(SP), DX
+	INCL DX
+	JMP search_loop_encodeBetterBlockAsm4MB
+
+match_length_ok_encodeBetterBlockAsm4MB:
+	MOVL R8, 16(SP)
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (BX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JB one_byte_match_emit_encodeBetterBlockAsm4MB
+	CMPL SI, $0x00000100
+	JB two_bytes_match_emit_encodeBetterBlockAsm4MB
+	CMPL SI, $0x00010000
+	JB three_bytes_match_emit_encodeBetterBlockAsm4MB
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (CX)
+	MOVW SI, 1(CX)
+	MOVB R11, 3(CX)
+	ADDQ $0x04, CX
+	JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+three_bytes_match_emit_encodeBetterBlockAsm4MB:
+	MOVB $0xf4, (CX)
+	MOVW SI, 1(CX)
+	ADDQ $0x03, CX
+	JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+two_bytes_match_emit_encodeBetterBlockAsm4MB:
+	MOVB $0xf0, (CX)
+	MOVB SI, 1(CX)
+	ADDQ $0x02, CX
+	CMPL SI, $0x40
+	JB memmove_match_emit_encodeBetterBlockAsm4MB
+	JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
+
+one_byte_match_emit_encodeBetterBlockAsm4MB:
+	SHLB $0x02, SI
+	MOVB SI, (CX)
+	ADDQ $0x01, CX
+
+memmove_match_emit_encodeBetterBlockAsm4MB:
+	LEAQ (CX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
+	CMPQ R9, $0x08
+	JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
+	JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (CX)
+	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (CX)
+	MOVL R10, -4(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (CX)
+	MOVQ R10, -8(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+
+memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
+	MOVQ SI, CX
+	JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
+
+memmove_long_match_emit_encodeBetterBlockAsm4MB:
+	LEAQ (CX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ R9, R13
+	SHRQ $0x05, R13
+	MOVQ CX, R11
+	ANDL $0x0000001f, R11
+	MOVQ $0x00000040, R14
+	SUBQ R11, R14
+	DECQ R13
+	JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+	LEAQ -32(R10)(R14*1), R11
+	LEAQ -32(CX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ $0x20, R15
+	ADDQ $0x20, R11
+	ADDQ $0x20, R14
+	DECQ R13
+	JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(CX)(R14*1)
+	MOVOA X5, -16(CX)(R14*1)
+	ADDQ $0x20, R14
+	CMPQ R9, R14
+	JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+	MOVQ SI, CX
+
+emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
+	ADDL R12, DX
+	ADDL $0x04, R12
+	MOVL DX, 12(SP)
+
+	// emitCopy
+	CMPL R8, $0x00010000
+	JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
+	CMPL R12, $0x40
+	JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
+	MOVB $0xff, (CX)
+	MOVL R8, 1(CX)
+	LEAL -64(R12), R12
+	ADDQ $0x05, CX
+	CMPL R12, $0x04
+	JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
+
+	// emitRepeat
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+	CMPL R8, $0x00000800
+	JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	CMPL R12, $0x00000104
+	JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+	CMPL R12, $0x00010100
+	JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (CX)
+	MOVW R12, 2(CX)
+	SARL $0x10, R8
+	MOVB R8, 4(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (CX)
+	MOVW R12, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (CX)
+	MOVB R12, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	SHLL $0x02, R12
+	ORL $0x01, R12
+	MOVW R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(CX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, R12
+	MOVB R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
+	TESTL R12, R12
+	JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+	XORL SI, SI
+	LEAL -1(SI)(R12*4), R12
+	MOVB R12, (CX)
+	MOVL R8, 1(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
+	CMPL R12, $0x40
+	JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
+	CMPL R8, $0x00000800
+	JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
+	MOVL $0x00000001, SI
+	LEAL 16(SI), SI
+	MOVB R8, 1(CX)
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, SI
+	MOVB SI, (CX)
+	ADDQ $0x02, CX
+	SUBL $0x08, R12
+
+	// emitRepeat
+	LEAL -4(R12), R12
+	JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+	CMPL R8, $0x00000800
+	JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+	CMPL R12, $0x00000104
+	JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+	CMPL R12, $0x00010100
+	JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (CX)
+	MOVW R12, 2(CX)
+	SARL $0x10, R8
+	MOVB R8, 4(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (CX)
+	MOVW R12, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (CX)
+	MOVB R12, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+	SHLL $0x02, R12
+	ORL $0x01, R12
+	MOVW R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(CX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, R12
+	MOVB R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
+	MOVB $0xee, (CX)
+	MOVW R8, 1(CX)
+	LEAL -60(R12), R12
+	ADDQ $0x03, CX
+
+	// emitRepeat
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+	CMPL R8, $0x00000800
+	JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	CMPL R12, $0x00000104
+	JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+	CMPL R12, $0x00010100
+	JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (CX)
+	MOVW R12, 2(CX)
+	SARL $0x10, R8
+	MOVB R8, 4(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (CX)
+	MOVW R12, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (CX)
+	MOVB R12, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	SHLL $0x02, R12
+	ORL $0x01, R12
+	MOVW R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(CX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, R12
+	MOVB R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
+	MOVL R12, SI
+	SHLL $0x02, SI
+	CMPL R12, $0x0c
+	JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
+	CMPL R8, $0x00000800
+	JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
+	LEAL -15(SI), SI
+	MOVB R8, 1(CX)
+	SHRL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, SI
+	MOVB SI, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
+	LEAL -2(SI), SI
+	MOVB SI, (CX)
+	MOVW R8, 1(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+match_is_repeat_encodeBetterBlockAsm4MB:
+	MOVL 12(SP), SI
+	CMPL SI, DI
+	JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
+	MOVL DI, R9
+	MOVL DI, 12(SP)
+	LEAQ (BX)(SI*1), R10
+	SUBL SI, R9
+	LEAL -1(R9), SI
+	CMPL SI, $0x3c
+	JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
+	CMPL SI, $0x00000100
+	JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
+	CMPL SI, $0x00010000
+	JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
+	MOVL SI, R11
+	SHRL $0x10, R11
+	MOVB $0xf8, (CX)
+	MOVW SI, 1(CX)
+	MOVB R11, 3(CX)
+	ADDQ $0x04, CX
+	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
+	MOVB $0xf4, (CX)
+	MOVW SI, 1(CX)
+	ADDQ $0x03, CX
+	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
+	MOVB $0xf0, (CX)
+	MOVB SI, 1(CX)
+	ADDQ $0x02, CX
+	CMPL SI, $0x40
+	JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB
+	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
+
+one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
+	SHLB $0x02, SI
+	MOVB SI, (CX)
+	ADDQ $0x01, CX
+
+memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
+	LEAQ (CX)(R9*1), SI
+
+	// genMemMoveShort
+	CMPQ R9, $0x04
+	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
+	CMPQ R9, $0x08
+	JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
+	CMPQ R9, $0x10
+	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
+	CMPQ R9, $0x20
+	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
+	JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
+	MOVL (R10), R11
+	MOVL R11, (CX)
+	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
+	MOVL (R10), R11
+	MOVL -4(R10)(R9*1), R10
+	MOVL R11, (CX)
+	MOVL R10, -4(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
+	MOVQ (R10), R11
+	MOVQ -8(R10)(R9*1), R10
+	MOVQ R11, (CX)
+	MOVQ R10, -8(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
+	MOVOU (R10), X0
+	MOVOU -16(R10)(R9*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(R9*1)
+	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
+
+emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+
+memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
+	MOVQ SI, CX
+	JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
+
+memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
+	LEAQ (CX)(R9*1), SI
+
+	// genMemMoveLong
+	MOVOU (R10), X0
+	MOVOU 16(R10), X1
+	MOVOU -32(R10)(R9*1), X2
+	MOVOU -16(R10)(R9*1), X3
+	MOVQ R9, R13
+	SHRQ $0x05, R13
+	MOVQ CX, R11
+	ANDL $0x0000001f, R11
+	MOVQ $0x00000040, R14
+	SUBQ R11, R14
+	DECQ R13
+	JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+	LEAQ -32(R10)(R14*1), R11
+	LEAQ -32(CX)(R14*1), R15
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
+	MOVOU (R11), X4
+	MOVOU 16(R11), X5
+	MOVOA X4, (R15)
+	MOVOA X5, 16(R15)
+	ADDQ $0x20, R15
+	ADDQ $0x20, R11
+	ADDQ $0x20, R14
+	DECQ R13
+	JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
+	MOVOU -32(R10)(R14*1), X4
+	MOVOU -16(R10)(R14*1), X5
+	MOVOA X4, -32(CX)(R14*1)
+	MOVOA X5, -16(CX)(R14*1)
+	ADDQ $0x20, R14
+	CMPQ R9, R14
+	JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R9*1)
+	MOVOU X3, -16(CX)(R9*1)
+	MOVQ SI, CX
+
+emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
+	ADDL R12, DX
+	ADDL $0x04, R12
+	MOVL DX, 12(SP)
+
+	// emitRepeat
+	MOVL R12, SI
+	LEAL -4(R12), R12
+	CMPL SI, $0x08
+	JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
+	CMPL SI, $0x0c
+	JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
+	CMPL R8, $0x00000800
+	JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
+
+cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	CMPL R12, $0x00000104
+	JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
+	CMPL R12, $0x00010100
+	JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
+	LEAL -65536(R12), R12
+	MOVL R12, R8
+	MOVW $0x001d, (CX)
+	MOVW R12, 2(CX)
+	SARL $0x10, R8
+	MOVB R8, 4(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	LEAL -256(R12), R12
+	MOVW $0x0019, (CX)
+	MOVW R12, 2(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	LEAL -4(R12), R12
+	MOVW $0x0015, (CX)
+	MOVB R12, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	SHLL $0x02, R12
+	ORL $0x01, R12
+	MOVW R12, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
+
+repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
+	XORQ SI, SI
+	LEAL 1(SI)(R12*4), R12
+	MOVB R8, 1(CX)
+	SARL $0x08, R8
+	SHLL $0x05, R8
+	ORL R8, R12
+	MOVB R12, (CX)
+	ADDQ $0x02, CX
+
+match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
+	CMPL DX, 8(SP)
+	JAE emit_remainder_encodeBetterBlockAsm4MB
+	CMPQ CX, (SP)
+	JB match_nolit_dst_ok_encodeBetterBlockAsm4MB
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+match_nolit_dst_ok_encodeBetterBlockAsm4MB:
+	MOVQ $0x00cf1bbcdcbfa563, SI
+	MOVQ $0x9e3779b1, R8
+	LEAQ 1(DI), DI
+	LEAQ -2(DX), R9
+	MOVQ (BX)(DI*1), R10
+	MOVQ 1(BX)(DI*1), R11
+	MOVQ (BX)(R9*1), R12
+	MOVQ 1(BX)(R9*1), R13
+	SHLQ $0x08, R10
+	IMULQ SI, R10
+	SHRQ $0x2f, R10
+	SHLQ $0x20, R11
+	IMULQ R8, R11
+	SHRQ $0x32, R11
+	SHLQ $0x08, R12
+	IMULQ SI, R12
+	SHRQ $0x2f, R12
+	SHLQ $0x20, R13
+	IMULQ R8, R13
+	SHRQ $0x32, R13
+	LEAQ 1(DI), R8
+	LEAQ 1(R9), R14
+	MOVL DI, (AX)(R10*4)
+	MOVL R9, (AX)(R12*4)
+	MOVL R8, 524288(AX)(R11*4)
+	MOVL R14, 524288(AX)(R13*4)
+	LEAQ 1(R9)(DI*1), R8
+	SHRQ $0x01, R8
+	ADDQ $0x01, DI
+	SUBQ $0x01, R9
+
+index_loop_encodeBetterBlockAsm4MB:
+	CMPQ R8, R9
+	JAE search_loop_encodeBetterBlockAsm4MB
+	MOVQ (BX)(DI*1), R10
+	MOVQ (BX)(R8*1), R11
+	SHLQ $0x08, R10
+	IMULQ SI, R10
+	SHRQ $0x2f, R10
+	SHLQ $0x08, R11
+	IMULQ SI, R11
+	SHRQ $0x2f, R11
+	MOVL DI, (AX)(R10*4)
+	MOVL R8, (AX)(R11*4)
+	ADDQ $0x02, DI
+	ADDQ $0x02, R8
+	JMP index_loop_encodeBetterBlockAsm4MB
+
+emit_remainder_encodeBetterBlockAsm4MB:
+	MOVQ src_len+32(FP), AX
+	SUBL 12(SP), AX
+	LEAQ 4(CX)(AX*1), AX
+	CMPQ AX, (SP)
+	JB emit_remainder_ok_encodeBetterBlockAsm4MB
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+emit_remainder_ok_encodeBetterBlockAsm4MB:
+	MOVQ src_len+32(FP), AX
+	MOVL 12(SP), DX
+	CMPL DX, AX
+	JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
+	MOVL AX, SI
+	MOVL AX, 12(SP)
+	LEAQ (BX)(DX*1), AX
+	SUBL DX, SI
+	LEAL -1(SI), DX
+	CMPL DX, $0x3c
+	JB one_byte_emit_remainder_encodeBetterBlockAsm4MB
+	CMPL DX, $0x00000100
+	JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB
+	CMPL DX, $0x00010000
+	JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB
+	MOVL DX, BX
+	SHRL $0x10, BX
+	MOVB $0xf8, (CX)
+	MOVW DX, 1(CX)
+	MOVB BL, 3(CX)
+	ADDQ $0x04, CX
+	JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
+	MOVB $0xf4, (CX)
+	MOVW DX, 1(CX)
+	ADDQ $0x03, CX
+	JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
+	MOVB $0xf0, (CX)
+	MOVB DL, 1(CX)
+	ADDQ $0x02, CX
+	CMPL DX, $0x40
+	JB memmove_emit_remainder_encodeBetterBlockAsm4MB
+	JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
+
+one_byte_emit_remainder_encodeBetterBlockAsm4MB:
+	SHLB $0x02, 
DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB + +memmove_long_emit_remainder_encodeBetterBlockAsm4MB: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func 
encodeBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm12B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000280, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm12B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -6(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBetterBlockAsm12B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm12B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL (AX)(R10*4), SI + MOVL 65536(AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, 65536(AX)(R11*4) + MOVQ (BX)(SI*1), R10 + MOVQ (BX)(R8*1), R11 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm12B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm12B + +no_short_found_encodeBetterBlockAsm12B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), DX + JMP search_loop_encodeBetterBlockAsm12B + +candidateS_match_encodeBetterBlockAsm12B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL (AX)(R10*4), SI + INCL DX + MOVL DX, (AX)(R10*4) + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm12B + DECL DX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm12B + +match_extend_back_loop_encodeBetterBlockAsm12B: + CMPL DX, DI + JBE match_extend_back_end_encodeBetterBlockAsm12B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBetterBlockAsm12B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm12B + JMP match_extend_back_loop_encodeBetterBlockAsm12B + +match_extend_back_end_encodeBetterBlockAsm12B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm12B: + MOVL DX, DI + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), R10 + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B + MOVQ (R9)(R12*1), R11 + MOVQ 8(R9)(R12*1), R13 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B + XORQ 8(R10)(R12*1), R13 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B + LEAL -16(R8), R8 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP 
match_nolit_end_encodeBetterBlockAsm12B + +matchlen_match8_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B + LEAL -8(R8), R8 + LEAL 8(R12), R12 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm12B + +matchlen_match4_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B + LEAL -4(R8), R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B + JB match_nolit_end_encodeBetterBlockAsm12B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B + LEAL 2(R12), R12 + SUBL $0x02, R8 + JZ match_nolit_end_encodeBetterBlockAsm12B + +matchlen_match1_match_nolit_encodeBetterBlockAsm12B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm12B + LEAL 1(R12), R12 + +match_nolit_end_encodeBetterBlockAsm12B: + MOVL DX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm12B + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_match_emit_encodeBetterBlockAsm12B + CMPL SI, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm12B + JB three_bytes_match_emit_encodeBetterBlockAsm12B + +three_bytes_match_emit_encodeBetterBlockAsm12B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +two_bytes_match_emit_encodeBetterBlockAsm12B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +one_byte_match_emit_encodeBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeBetterBlockAsm12B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (CX) + MOVL R10, -4(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm12B: + MOVQ SI, CX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B + +memmove_long_match_emit_encodeBetterBlockAsm12B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_encodeBetterBlockAsm12B: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitCopy + CMPL R12, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + CMPL R8, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + CMPL R12, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + LEAL -256(R12), R12 + MOVW $0x0019, (CX) + MOVW R12, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (CX) + MOVB R12, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + XORQ SI, SI + LEAL 
1(SI)(R12*4), R12 + MOVB R8, 1(CX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +long_offset_short_match_nolit_encodeBetterBlockAsm12B: + MOVB $0xee, (CX) + MOVW R8, 1(CX) + LEAL -60(R12), R12 + ADDQ $0x03, CX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL R8, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + CMPL R12, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (CX) + MOVW R12, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (CX) + MOVB R12, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(CX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: + MOVL R12, SI + SHLL $0x02, SI + CMPL R12, $0x0c + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + CMPL R8, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + LEAL -15(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeBetterBlockAsm12B: + LEAL -2(SI), SI + MOVB SI, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +match_is_repeat_encodeBetterBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B + CMPL SI, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B + +three_bytes_match_emit_repeat_encodeBetterBlockAsm12B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_repeat_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B + +one_byte_match_emit_repeat_encodeBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_repeat_encodeBetterBlockAsm12B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (CX) + MOVL R10, -4(CX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B + CMPL R8, $0x00000800 + JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: + CMPL R12, $0x00000104 + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B + LEAL -256(R12), R12 + MOVW $0x0019, (CX) + MOVW R12, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + 
+repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: + LEAL -4(R12), R12 + MOVW $0x0015, (CX) + MOVB R12, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(CX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (CX) + ADDQ $0x02, CX + +match_nolit_emitcopy_end_encodeBetterBlockAsm12B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm12B + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + LEAQ 1(DI), DI + LEAQ -2(DX), R9 + MOVQ (BX)(DI*1), R10 + MOVQ 1(BX)(DI*1), R11 + MOVQ (BX)(R9*1), R12 + MOVQ 1(BX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x32, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x34, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 + MOVL DI, (AX)(R10*4) + MOVL R9, (AX)(R12*4) + MOVL R8, 65536(AX)(R11*4) + MOVL R14, 65536(AX)(R13*4) + LEAQ 1(R9)(DI*1), R8 + SHRQ $0x01, R8 + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm12B: + CMPQ R8, R9 + JAE search_loop_encodeBetterBlockAsm12B + MOVQ (BX)(DI*1), R10 + MOVQ (BX)(R8*1), R11 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL DI, (AX)(R10*4) + MOVL R8, (AX)(R11*4) + ADDQ $0x02, DI + ADDQ $0x02, R8 + JMP index_loop_encodeBetterBlockAsm12B + +emit_remainder_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBetterBlockAsm12B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm12B + JB three_bytes_emit_remainder_encodeBetterBlockAsm12B + +three_bytes_emit_remainder_encodeBetterBlockAsm12B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +two_bytes_emit_remainder_encodeBetterBlockAsm12B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +one_byte_emit_remainder_encodeBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + +memmove_long_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm10B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x000000a0, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm10B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -6(DX), 
BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBetterBlockAsm10B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm10B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL (AX)(R10*4), SI + MOVL 16384(AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, 16384(AX)(R11*4) + MOVQ (BX)(SI*1), R10 + MOVQ (BX)(R8*1), R11 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm10B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm10B + +no_short_found_encodeBetterBlockAsm10B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm10B + MOVL 20(SP), DX + JMP search_loop_encodeBetterBlockAsm10B + +candidateS_match_encodeBetterBlockAsm10B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL (AX)(R10*4), SI + INCL DX + MOVL DX, (AX)(R10*4) + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm10B + DECL DX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm10B + +match_extend_back_loop_encodeBetterBlockAsm10B: + CMPL DX, DI + JBE match_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm10B + JMP match_extend_back_loop_encodeBetterBlockAsm10B + +match_extend_back_end_encodeBetterBlockAsm10B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm10B: + MOVL DX, DI + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), R10 + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B + MOVQ (R9)(R12*1), R11 + MOVQ 8(R9)(R12*1), R13 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B + XORQ 8(R10)(R12*1), R13 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B + LEAL -16(R8), R8 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm10B + +matchlen_match8_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B + LEAL -8(R8), R8 + LEAL 8(R12), R12 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm10B + 
+matchlen_match4_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B + LEAL -4(R8), R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B + JB match_nolit_end_encodeBetterBlockAsm10B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B + LEAL 2(R12), R12 + SUBL $0x02, R8 + JZ match_nolit_end_encodeBetterBlockAsm10B + +matchlen_match1_match_nolit_encodeBetterBlockAsm10B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeBetterBlockAsm10B + LEAL 1(R12), R12 + +match_nolit_end_encodeBetterBlockAsm10B: + MOVL DX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm10B + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_match_emit_encodeBetterBlockAsm10B + CMPL SI, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm10B + JB three_bytes_match_emit_encodeBetterBlockAsm10B + +three_bytes_match_emit_encodeBetterBlockAsm10B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +two_bytes_match_emit_encodeBetterBlockAsm10B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +one_byte_match_emit_encodeBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeBetterBlockAsm10B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (CX) + MOVL R10, -4(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm10B: + MOVQ SI, 
CX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B + +memmove_long_match_emit_encodeBetterBlockAsm10B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_encodeBetterBlockAsm10B: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitCopy + CMPL R12, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + CMPL R8, $0x00000800 + JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + CMPL R12, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + LEAL -256(R12), R12 + MOVW $0x0019, (CX) + MOVW R12, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (CX) + MOVB R12, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(CX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +long_offset_short_match_nolit_encodeBetterBlockAsm10B: + MOVB $0xee, (CX) + MOVW R8, 1(CX) + LEAL -60(R12), R12 + ADDQ $0x03, CX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL R8, $0x00000800 + JB 
repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + CMPL R12, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (CX) + MOVW R12, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (CX) + MOVB R12, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(CX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: + MOVL R12, SI + SHLL $0x02, SI + CMPL R12, $0x0c + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + CMPL R8, $0x00000800 + JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + LEAL -15(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeBetterBlockAsm10B: + LEAL -2(SI), SI + MOVB SI, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +match_is_repeat_encodeBetterBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B + CMPL SI, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B + +three_bytes_match_emit_repeat_encodeBetterBlockAsm10B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_repeat_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B + +one_byte_match_emit_repeat_encodeBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_repeat_encodeBetterBlockAsm10B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (CX) + MOVL R10, -4(CX)(R9*1) + JMP 
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B + CMPL R8, $0x00000800 + JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: + CMPL R12, $0x00000104 + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B + LEAL -256(R12), R12 + MOVW $0x0019, (CX) + MOVW R12, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: + LEAL -4(R12), R12 + MOVW $0x0015, (CX) + MOVB R12, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(CX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (CX) + ADDQ $0x02, CX + +match_nolit_emitcopy_end_encodeBetterBlockAsm10B: + CMPL DX, 8(SP) + JAE 
emit_remainder_encodeBetterBlockAsm10B + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + LEAQ 1(DI), DI + LEAQ -2(DX), R9 + MOVQ (BX)(DI*1), R10 + MOVQ 1(BX)(DI*1), R11 + MOVQ (BX)(R9*1), R12 + MOVQ 1(BX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x34, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x36, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 + MOVL DI, (AX)(R10*4) + MOVL R9, (AX)(R12*4) + MOVL R8, 16384(AX)(R11*4) + MOVL R14, 16384(AX)(R13*4) + LEAQ 1(R9)(DI*1), R8 + SHRQ $0x01, R8 + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm10B: + CMPQ R8, R9 + JAE search_loop_encodeBetterBlockAsm10B + MOVQ (BX)(DI*1), R10 + MOVQ (BX)(R8*1), R11 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL DI, (AX)(R10*4) + MOVL R8, (AX)(R11*4) + ADDQ $0x02, DI + ADDQ $0x02, R8 + JMP index_loop_encodeBetterBlockAsm10B + +emit_remainder_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBetterBlockAsm10B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm10B + JB three_bytes_emit_remainder_encodeBetterBlockAsm10B + +three_bytes_emit_remainder_encodeBetterBlockAsm10B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +two_bytes_emit_remainder_encodeBetterBlockAsm10B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +one_byte_emit_remainder_encodeBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL 
SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + +memmove_long_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm8B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000028, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm8B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -6(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBetterBlockAsm8B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 1(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm8B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x38, R11 + MOVL (AX)(R10*4), SI + MOVL 4096(AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, 4096(AX)(R11*4) + MOVQ (BX)(SI*1), R10 + MOVQ (BX)(R8*1), 
R11 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm8B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm8B + +no_short_found_encodeBetterBlockAsm8B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm8B + MOVL 20(SP), DX + JMP search_loop_encodeBetterBlockAsm8B + +candidateS_match_encodeBetterBlockAsm8B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL (AX)(R10*4), SI + INCL DX + MOVL DX, (AX)(R10*4) + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm8B + DECL DX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm8B + +match_extend_back_loop_encodeBetterBlockAsm8B: + CMPL DX, DI + JBE match_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBetterBlockAsm8B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm8B + JMP match_extend_back_loop_encodeBetterBlockAsm8B + +match_extend_back_end_encodeBetterBlockAsm8B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm8B: + MOVL DX, DI + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), R10 + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B + MOVQ (R9)(R12*1), R11 + MOVQ 8(R9)(R12*1), R13 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B + XORQ 8(R10)(R12*1), R13 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B + LEAL -16(R8), R8 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_match8_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B + LEAL -8(R8), R8 + LEAL 8(R12), R12 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_match4_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B + LEAL -4(R8), R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B + JB match_nolit_end_encodeBetterBlockAsm8B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B + LEAL 2(R12), R12 + SUBL $0x02, R8 + JZ match_nolit_end_encodeBetterBlockAsm8B + +matchlen_match1_match_nolit_encodeBetterBlockAsm8B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE 
match_nolit_end_encodeBetterBlockAsm8B + LEAL 1(R12), R12 + +match_nolit_end_encodeBetterBlockAsm8B: + MOVL DX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL 16(SP), R8 + JEQ match_is_repeat_encodeBetterBlockAsm8B + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_match_emit_encodeBetterBlockAsm8B + CMPL SI, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm8B + JB three_bytes_match_emit_encodeBetterBlockAsm8B + +three_bytes_match_emit_encodeBetterBlockAsm8B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +two_bytes_match_emit_encodeBetterBlockAsm8B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +one_byte_match_emit_encodeBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeBetterBlockAsm8B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x04 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R9, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R10), R11 + MOVL R11, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R10), R11 + MOVL -4(R10)(R9*1), R10 + MOVL R11, (CX) + MOVL R10, -4(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm8B: + MOVQ SI, CX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B + +memmove_long_match_emit_encodeBetterBlockAsm8B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_encodeBetterBlockAsm8B: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitCopy + CMPL R12, $0x40 + JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + CMPL R12, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + LEAL -256(R12), R12 + MOVW $0x0019, (CX) + MOVW R12, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (CX) + MOVB R12, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(CX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +long_offset_short_match_nolit_encodeBetterBlockAsm8B: + MOVB $0xee, (CX) + MOVW R8, 1(CX) + LEAL -60(R12), R12 + ADDQ $0x03, CX + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + CMPL R12, $0x00000104 + JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + LEAL -256(R12), R12 + MOVW $0x0019, (CX) + MOVW R12, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + LEAL -4(R12), R12 + MOVW $0x0015, (CX) + MOVB R12, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(CX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: + MOVL R12, SI + SHLL $0x02, SI + CMPL R12, $0x0c + JAE 
emit_copy_three_match_nolit_encodeBetterBlockAsm8B + LEAL -15(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeBetterBlockAsm8B: + LEAL -2(SI), SI + MOVB SI, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +match_is_repeat_encodeBetterBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B + CMPL SI, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B + +three_bytes_match_emit_repeat_encodeBetterBlockAsm8B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_repeat_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B + +one_byte_match_emit_repeat_encodeBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (CX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x04 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (CX) + MOVL R9, -4(CX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (CX) + MOVQ R9, -8(CX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B + +memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: + LEAQ (CX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R11 + JA 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R13*1), X4 + MOVOU -16(R9)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ SI, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitRepeat + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B + CMPL SI, $0x0c + JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B + +cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: + CMPL R12, $0x00000104 + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B + LEAL -256(R12), R12 + MOVW $0x0019, (CX) + MOVW R12, 2(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: + LEAL -4(R12), R12 + MOVW $0x0015, (CX) + MOVB R12, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(CX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (CX) + ADDQ $0x02, CX + +match_nolit_emitcopy_end_encodeBetterBlockAsm8B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm8B + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + LEAQ 1(DI), DI + LEAQ -2(DX), R9 + MOVQ (BX)(DI*1), R10 + MOVQ 1(BX)(DI*1), R11 + MOVQ (BX)(R9*1), R12 + MOVQ 1(BX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x36, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x38, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 + MOVL DI, (AX)(R10*4) + MOVL R9, (AX)(R12*4) + MOVL R8, 4096(AX)(R11*4) + MOVL R14, 4096(AX)(R13*4) + LEAQ 1(R9)(DI*1), R8 + SHRQ $0x01, R8 + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm8B: + CMPQ R8, R9 + JAE search_loop_encodeBetterBlockAsm8B + MOVQ (BX)(DI*1), R10 + MOVQ (BX)(R8*1), R11 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL DI, (AX)(R10*4) + MOVL R8, (AX)(R11*4) + ADDQ $0x02, DI + ADDQ $0x02, R8 + JMP index_loop_encodeBetterBlockAsm8B + +emit_remainder_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ 
emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeBetterBlockAsm8B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm8B + JB three_bytes_emit_remainder_encodeBetterBlockAsm8B + +three_bytes_emit_remainder_encodeBetterBlockAsm8B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +two_bytes_emit_remainder_encodeBetterBlockAsm8B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +one_byte_emit_remainder_encodeBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + +memmove_long_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 
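The genMemMoveShort/genMemMoveLong fragments that recur throughout this generated file follow one pattern: the first and last 16 or 32 bytes of the source are loaded into XMM registers up front (the MOVOUs into X0..X3), the middle is streamed in 32-byte blocks (the MOVOA stores are aligned; the "MOVQ $0x40 / SUBQ CX&0x1f" arithmetic computes the offset that makes them so), and the saved edge vectors are stored only at the end, which also covers the ragged head and tail of the range. A minimal Go sketch of that ordering, with moveLong as an illustrative name (literal copies here go between distinct buffers, so plain copy stands in for the SSE moves):

package main

import (
	"bytes"
	"fmt"
)

// moveLong sketches the genMemMoveLong ordering: read both edges
// first, bulk-copy the middle in 32-byte blocks, then write the
// saved head and tail last.
func moveLong(dst, src []byte) {
	n := len(src)
	if n <= 64 {
		copy(dst, src) // the generated code has dedicated short paths
		return
	}
	var head, tail [32]byte
	copy(head[:], src[:32])   // MOVOU (src), X0 / 16(src), X1
	copy(tail[:], src[n-32:]) // MOVOU -32(src)(n*1), X2 / -16(...), X3
	for i := 32; i < n-32; i += 32 {
		end := i + 32
		if end > n-32 {
			end = n - 32
		}
		copy(dst[i:end], src[i:end]) // big_loop_back / forward_sse_loop_32
	}
	copy(dst, head[:]) // deferred edge stores
	copy(dst[n-32:], tail[:])
}

func main() {
	src := bytes.Repeat([]byte("0123456789abcdef"), 6) // 96 bytes
	dst := make([]byte, len(src))
	moveLong(dst, src)
	fmt.Println(bytes.Equal(dst, src)) // true
}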
+ MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeSnappyBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000200, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBlockAsm: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm + +repeat_extend_back_loop_encodeSnappyBlockAsm: + CMPL DI, SI + JBE repeat_extend_back_end_encodeSnappyBlockAsm + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeSnappyBlockAsm + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm + +repeat_extend_back_end_encodeSnappyBlockAsm: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 5(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm + CMPL SI, $0x00010000 + JB three_bytes_repeat_emit_encodeSnappyBlockAsm + CMPL SI, $0x01000000 + JB four_bytes_repeat_emit_encodeSnappyBlockAsm + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x05, CX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +four_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVL SI, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW 
SI, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +three_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +two_bytes_repeat_emit_encodeSnappyBlockAsm: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + +one_byte_repeat_emit_encodeSnappyBlockAsm: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_repeat_emit_encodeSnappyBlockAsm: + LEAQ (CX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (CX) + MOVQ R9, -8(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm + +memmove_long_repeat_emit_encodeSnappyBlockAsm: + LEAQ (CX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + 
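Each one_byte/two_bytes/three_bytes (and, in the full Snappy encoder, four_bytes) ladder writes the Snappy literal header: a length up to 60 is packed into the tag byte as (n-1)<<2, while the tag values 0xf0, 0xf4, 0xf8 and 0xfc are 60<<2 through 63<<2 and announce one to four little-endian bytes of n-1 before the literal bytes themselves. The same branch structure in Go (a format sketch, not code from this change; dst is assumed large enough, since the assembly checks its output bound beforehand):

package main

import "fmt"

// emitLiteral writes a Snappy literal header, then the bytes.
func emitLiteral(dst, lit []byte) int {
	i, n := 0, len(lit)-1
	switch {
	case n < 60: // one_byte_*: SHLB $0x02
		dst[0] = byte(n) << 2
		i = 1
	case n < 1<<8: // two_bytes_*: 0xf0
		dst[0], dst[1] = 0xf0, byte(n)
		i = 2
	case n < 1<<16: // three_bytes_*: 0xf4
		dst[0], dst[1], dst[2] = 0xf4, byte(n), byte(n>>8)
		i = 3
	case n < 1<<24: // four_bytes_*: 0xf8
		dst[0], dst[1], dst[2], dst[3] = 0xf8, byte(n), byte(n>>8), byte(n>>16)
		i = 4
	default: // 0xfc: the MOVL stores all four length bytes
		dst[0], dst[1], dst[2], dst[3], dst[4] = 0xfc, byte(n), byte(n>>8), byte(n>>16), byte(n>>24)
		i = 5
	}
	return i + copy(dst[i:], lit)
}

func main() {
	dst := make([]byte, 8)
	n := emitLiteral(dst, []byte("abc"))
	fmt.Printf("%x\n", dst[:n]) // 08616263: tag (3-1)<<2, then "abc"
}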
XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm + JB repeat_extend_forward_end_encodeSnappyBlockAsm + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeSnappyBlockAsm: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy + CMPL DI, $0x00010000 + JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: + CMPL SI, $0x40 + JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0xff, (CX) + MOVL DI, 1(CX) + LEAL -64(SI), SI + ADDQ $0x05, CX + CMPL SI, $0x04 + JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm + +four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: + TESTL SI, SI + JZ repeat_end_emit_encodeSnappyBlockAsm + XORL R8, R8 + LEAL -1(R8)(SI*4), SI + MOVB SI, (CX) + MOVL DI, 1(CX) + ADDQ $0x05, CX + JMP repeat_end_emit_encodeSnappyBlockAsm + +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + CMPL DI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + LEAL -15(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeSnappyBlockAsm + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: + LEAL -2(R8), R8 + MOVB R8, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + +repeat_end_emit_encodeSnappyBlockAsm: + MOVL DX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm + 
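The emitCopy section just above walks the three Snappy copy element sizes in the order of its branches: offsets of 64 KiB and up take the 5-byte copy-4 form (0xff = 63<<2|3, a 64-byte slice, looped), long matches at smaller offsets are cut into 60-byte 3-byte chunks (0xee = 59<<2|2), and lengths 4..11 with offsets under 2048 fit the 2-byte form. The byte layout in Go (constants follow the Snappy wire format; the helper name is illustrative and dst is assumed large enough):

package main

import (
	"encoding/binary"
	"fmt"
)

const (
	tagCopy1 = 0x01 // 2 bytes: 3-bit length, 11-bit offset
	tagCopy2 = 0x02 // 3 bytes: 6-bit length, 16-bit offset
	tagCopy4 = 0x03 // 5 bytes: 6-bit length, 32-bit offset
)

// emitCopy mirrors the branch order of the emitCopy section above.
func emitCopy(dst []byte, offset, length int) int {
	i := 0
	if offset >= 1<<16 { // four_bytes_loop_back_*
		for length > 64 {
			dst[i] = 63<<2 | tagCopy4 // 0xff: a 64-byte slice
			binary.LittleEndian.PutUint32(dst[i+1:], uint32(offset))
			i, length = i+5, length-64
		}
		dst[i] = byte(length-1)<<2 | tagCopy4
		binary.LittleEndian.PutUint32(dst[i+1:], uint32(offset))
		return i + 5
	}
	for length > 64 { // 0xee = 59<<2|tagCopy2: 60-byte slices
		dst[i] = 59<<2 | tagCopy2
		dst[i+1], dst[i+2] = byte(offset), byte(offset>>8)
		i, length = i+3, length-60
	}
	if length >= 12 || offset >= 2048 { // emit_copy_three_*
		dst[i] = byte(length-1)<<2 | tagCopy2
		dst[i+1], dst[i+2] = byte(offset), byte(offset>>8)
		return i + 3
	}
	// 2-byte form; LEAL -15(len*4) is (length-4)<<2 | tagCopy1
	dst[i] = byte(offset>>8)<<5 | byte(length-4)<<2 | tagCopy1
	dst[i+1] = byte(offset)
	return i + 2
}

func main() {
	dst := make([]byte, 8)
	fmt.Printf("%x\n", dst[:emitCopy(dst, 100, 8)]) // 1164
}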
+no_repeat_found_encodeSnappyBlockAsm: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBlockAsm + +candidate3_match_encodeSnappyBlockAsm: + ADDL $0x02, DX + JMP candidate_match_encodeSnappyBlockAsm + +candidate2_match_encodeSnappyBlockAsm: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm + +match_extend_back_loop_encodeSnappyBlockAsm: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBlockAsm + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBlockAsm + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm + JMP match_extend_back_loop_encodeSnappyBlockAsm + +match_extend_back_end_encodeSnappyBlockAsm: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 5(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm + CMPL R8, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm + CMPL R8, $0x00010000 + JB three_bytes_match_emit_encodeSnappyBlockAsm + CMPL R8, $0x01000000 + JB four_bytes_match_emit_encodeSnappyBlockAsm + MOVB $0xfc, (CX) + MOVL R8, 1(CX) + ADDQ $0x05, CX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +four_bytes_match_emit_encodeSnappyBlockAsm: + MOVL R8, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R8, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +three_bytes_match_emit_encodeSnappyBlockAsm: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +two_bytes_match_emit_encodeSnappyBlockAsm: + MOVB $0xf0, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm + JMP memmove_long_match_emit_encodeSnappyBlockAsm + +one_byte_match_emit_encodeSnappyBlockAsm: + SHLB $0x02, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBlockAsm: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (CX) + MOVQ DI, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP 
memmove_end_copy_match_emit_encodeSnappyBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm: + MOVQ R8, CX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm + +memmove_long_match_emit_encodeSnappyBlockAsm: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_encodeSnappyBlockAsm: +match_nolit_loop_encodeSnappyBlockAsm: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm + +matchlen_match8_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm + +matchlen_match4_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm + JB match_nolit_end_encodeSnappyBlockAsm + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE 
matchlen_match1_match_nolit_encodeSnappyBlockAsm + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBlockAsm + +matchlen_match1_match_nolit_encodeSnappyBlockAsm: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm + LEAL 1(R10), R10 + +match_nolit_end_encodeSnappyBlockAsm: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy + CMPL SI, $0x00010000 + JB two_byte_offset_match_nolit_encodeSnappyBlockAsm + +four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: + CMPL R10, $0x40 + JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm + MOVB $0xff, (CX) + MOVL SI, 1(CX) + LEAL -64(R10), R10 + ADDQ $0x05, CX + CMPL R10, $0x04 + JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm + +four_bytes_remain_match_nolit_encodeSnappyBlockAsm: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm + XORL DI, DI + LEAL -1(DI)(R10*4), R10 + MOVB R10, (CX) + MOVL SI, 1(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm + +two_byte_offset_match_nolit_encodeSnappyBlockAsm: + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm + MOVB $0xee, (CX) + MOVW SI, 1(CX) + LEAL -60(R10), R10 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm + CMPL SI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm + LEAL -15(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm + +emit_copy_three_match_nolit_encodeSnappyBlockAsm: + LEAL -2(DI), DI + MOVB DI, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm + INCL DX + JMP search_loop_encodeSnappyBlockAsm + +emit_remainder_encodeSnappyBlockAsm: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 5(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeSnappyBlockAsm + CMPL DX, $0x01000000 + JB four_bytes_emit_remainder_encodeSnappyBlockAsm + MOVB $0xfc, (CX) + MOVL DX, 1(CX) + ADDQ $0x05, CX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +four_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BL, 3(CX) + ADDQ $0x04, CX 
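Every matchlen_* ladder in this file computes the same function: XOR 8-byte words, and when the result is non-zero, count trailing zero bits with TZCNT (or BSF when the GOAMD64_v3 guard is off) and shift right by 3 to turn the first differing bit into a count of equal leading bytes. The portable Go equivalent the assembly accelerates (assuming len(a) <= len(b), which the remaining-length bookkeeping guarantees here):

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// matchLen counts equal leading bytes, 8 at a time, mirroring the
// XOR + TZCNT/BSF loop above (SARQ $0x03 is the >>3 below).
func matchLen(a, b []byte) (n int) {
	for len(a) >= 8 {
		x := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
		if x != 0 {
			return n + bits.TrailingZeros64(x)>>3
		}
		a, b, n = a[8:], b[8:], n+8
	}
	for i := range a { // the 4/2/1-byte tails in the assembly
		if a[i] != b[i] {
			break
		}
		n++
	}
	return n
}

func main() {
	fmt.Println(matchLen([]byte("abcdefgh12"), []byte("abcdefgh1X"))) // 9
}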
+ JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +three_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +two_bytes_emit_remainder_encodeSnappyBlockAsm: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + +one_byte_emit_remainder_encodeSnappyBlockAsm: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBlockAsm: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm + +memmove_long_emit_remainder_encodeSnappyBlockAsm: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + 
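Each search_loop opens with the snappy-style skip heuristic: "SHRL $0x06, SI / LEAL 4(DX)(SI*1), SI" computes the next probe position as s + 4 + (s-nextEmit)>>6 (12(SP) holds nextEmit), so runs of incompressible data are scanned progressively faster; the encodeBetterBlockAsm variants earlier in this hunk use +1 and >>4 for a denser search. In Go (nextS is an illustrative name):

package main

import "fmt"

// nextS mirrors "SHRL $0x06, SI / LEAL 4(DX)(SI*1), SI": the farther s
// has run past the last emitted position, the larger the skip.
func nextS(s, nextEmit int) int {
	return s + 4 + (s-nextEmit)>>6
}

func main() {
	fmt.Println(nextS(1000, 1000))     // 1004: dense probing right after an emit
	fmt.Println(nextS(1000, 1000-640)) // 1014: sparse probing through noise
}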
MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeSnappyBlockAsm64K(dst []byte, src []byte, tmp *[65536]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm64K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000200, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm64K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBlockAsm64K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm64K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm64K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm64K + +repeat_extend_back_loop_encodeSnappyBlockAsm64K: + CMPL DI, SI + JBE repeat_extend_back_end_encodeSnappyBlockAsm64K + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeSnappyBlockAsm64K + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K + +repeat_extend_back_end_encodeSnappyBlockAsm64K: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm64K: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm64K + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K + JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K + +three_bytes_repeat_emit_encodeSnappyBlockAsm64K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K + +two_bytes_repeat_emit_encodeSnappyBlockAsm64K: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm64K + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K + +one_byte_repeat_emit_encodeSnappyBlockAsm64K: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_repeat_emit_encodeSnappyBlockAsm64K: + LEAQ (CX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (CX) + MOVQ R9, -8(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K + +memmove_long_repeat_emit_encodeSnappyBlockAsm64K: + LEAQ (CX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x08 + JB 
matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K + JB repeat_extend_forward_end_encodeSnappyBlockAsm64K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeSnappyBlockAsm64K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K + CMPL DI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K + LEAL -15(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeSnappyBlockAsm64K + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: + LEAL -2(R8), R8 + MOVB R8, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + +repeat_end_emit_encodeSnappyBlockAsm64K: + MOVL DX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm64K + +no_repeat_found_encodeSnappyBlockAsm64K: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm64K + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm64K + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm64K + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBlockAsm64K + +candidate3_match_encodeSnappyBlockAsm64K: + ADDL $0x02, DX + JMP candidate_match_encodeSnappyBlockAsm64K + +candidate2_match_encodeSnappyBlockAsm64K: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm64K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm64K + +match_extend_back_loop_encodeSnappyBlockAsm64K: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBlockAsm64K + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBlockAsm64K + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm64K + JMP match_extend_back_loop_encodeSnappyBlockAsm64K + +match_extend_back_end_encodeSnappyBlockAsm64K: + MOVL DX, DI + 
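The IMULQ constant 0x0000cf1bbcdcbf9b seen at every table probe is a 6-byte multiplicative hash: "SHLQ $0x10 / IMULQ / SHRQ $0x32" maps the low 48 bits of the cursor word to a 14-bit slot in the 65536-byte tmp table, and the candidate/no_repeat_found blocks above compare the word at the remembered position before accepting a match. (The encodeBetterBlockAsm functions pair this with a 4-byte hash, the 0x9e3779b1 multiplier after SHLQ $0x20, into a second, smaller table for short matches.) A sketch of the probe-and-update step (names and the example position are illustrative):

package main

import (
	"encoding/binary"
	"fmt"
)

const prime6bytes = 0x0000cf1bbcdcbf9b // the IMULQ constant above

// hash6 hashes the low 6 bytes of u into tableBits bits, matching
// "SHLQ $0x10, IMULQ prime, SHRQ $(64-tableBits)" in the assembly.
func hash6(u uint64, tableBits uint) uint32 {
	return uint32(((u << 16) * prime6bytes) >> (64 - tableBits))
}

func main() {
	var table [1 << 14]uint32 // the 65536-byte tmp area, viewed as uint32s
	src := []byte("these bytes repeat: these bytes repeat")
	s := 20 // cursor inside the second occurrence (illustrative)
	cv := binary.LittleEndian.Uint64(src[s:])
	h := hash6(cv, 14)
	candidate := table[h] // previous position with this hash (0 here)
	table[h] = uint32(s)  // MOVL DX, (AX)(R10*4)
	fmt.Println(h, candidate)
}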
SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm64K: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm64K + CMPL R8, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm64K + JB three_bytes_match_emit_encodeSnappyBlockAsm64K + +three_bytes_match_emit_encodeSnappyBlockAsm64K: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBlockAsm64K + +two_bytes_match_emit_encodeSnappyBlockAsm64K: + MOVB $0xf0, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm64K + JMP memmove_long_match_emit_encodeSnappyBlockAsm64K + +one_byte_match_emit_encodeSnappyBlockAsm64K: + SHLB $0x02, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBlockAsm64K: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (CX) + MOVQ DI, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: + MOVQ R8, CX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K + +memmove_long_match_emit_encodeSnappyBlockAsm64K: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE 
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_encodeSnappyBlockAsm64K: +match_nolit_loop_encodeSnappyBlockAsm64K: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_match8_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K + JB match_nolit_end_encodeSnappyBlockAsm64K + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm64K + LEAL 1(R10), R10 + +match_nolit_end_encodeSnappyBlockAsm64K: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K + MOVB $0xee, (CX) + MOVW SI, 1(CX) + LEAL -60(R10), R10 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K + CMPL SI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K + LEAL -15(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K + +emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: + LEAL -2(DI), DI + MOVB DI, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm64K + MOVQ 
-2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm64K: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x32, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm64K + INCL DX + JMP search_loop_encodeSnappyBlockAsm64K + +emit_remainder_encodeSnappyBlockAsm64K: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm64K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm64K + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K + JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K + +three_bytes_emit_remainder_encodeSnappyBlockAsm64K: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K + +two_bytes_emit_remainder_encodeSnappyBlockAsm64K: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm64K + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K + +one_byte_emit_remainder_encodeSnappyBlockAsm64K: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBlockAsm64K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + 
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K + +memmove_long_emit_remainder_encodeSnappyBlockAsm64K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeSnappyBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm12B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000080, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm12B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBlockAsm12B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm12B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x18, R11 + IMULQ R9, R11 + SHRQ $0x34, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm12B + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm12B + +repeat_extend_back_loop_encodeSnappyBlockAsm12B: + CMPL DI, SI + JBE repeat_extend_back_end_encodeSnappyBlockAsm12B + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeSnappyBlockAsm12B + LEAL 
-1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B + +repeat_extend_back_end_encodeSnappyBlockAsm12B: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm12B + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B + JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B + +three_bytes_repeat_emit_encodeSnappyBlockAsm12B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + +two_bytes_repeat_emit_encodeSnappyBlockAsm12B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm12B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + +one_byte_repeat_emit_encodeSnappyBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_repeat_emit_encodeSnappyBlockAsm12B: + LEAQ (CX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (CX) + MOVQ R9, -8(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B + +memmove_long_repeat_emit_encodeSnappyBlockAsm12B: + LEAQ (CX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), 
X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B + JB repeat_extend_forward_end_encodeSnappyBlockAsm12B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeSnappyBlockAsm12B: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + CMPL DI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + LEAL -15(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeSnappyBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: + LEAL -2(R8), R8 + MOVB R8, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, 
CX + +repeat_end_emit_encodeSnappyBlockAsm12B: + MOVL DX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm12B + +no_repeat_found_encodeSnappyBlockAsm12B: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm12B + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm12B + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm12B + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBlockAsm12B + +candidate3_match_encodeSnappyBlockAsm12B: + ADDL $0x02, DX + JMP candidate_match_encodeSnappyBlockAsm12B + +candidate2_match_encodeSnappyBlockAsm12B: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm12B + +match_extend_back_loop_encodeSnappyBlockAsm12B: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBlockAsm12B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBlockAsm12B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm12B + JMP match_extend_back_loop_encodeSnappyBlockAsm12B + +match_extend_back_end_encodeSnappyBlockAsm12B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm12B: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm12B + CMPL R8, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm12B + JB three_bytes_match_emit_encodeSnappyBlockAsm12B + +three_bytes_match_emit_encodeSnappyBlockAsm12B: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + +two_bytes_match_emit_encodeSnappyBlockAsm12B: + MOVB $0xf0, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + +one_byte_match_emit_encodeSnappyBlockAsm12B: + SHLB $0x02, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBlockAsm12B: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (CX) + MOVQ DI, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU 
-16(DI)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: + MOVQ R8, CX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B + +memmove_long_match_emit_encodeSnappyBlockAsm12B: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_encodeSnappyBlockAsm12B: +match_nolit_loop_encodeSnappyBlockAsm12B: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_match8_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B + JB match_nolit_end_encodeSnappyBlockAsm12B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBlockAsm12B + 
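+ // matchLen reference (a hedged Go sketch of the 16/8/4/2/1-byte ladder
+ // around this point; an illustration only, not the generator's source,
+ // and it assumes len(b) >= len(a)):
+ //
+ //	func matchLen(a, b []byte) (n int) {
+ //		for len(a) >= 8 {
+ //			x := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
+ //			if x != 0 {
+ //				return n + bits.TrailingZeros64(x)>>3 // TZCNT/BSF, then SARQ $3
+ //			}
+ //			a, b, n = a[8:], b[8:], n+8
+ //		}
+ //		for i := range a { // 4/2/1-byte tails in the assembly
+ //			if a[i] != b[i] {
+ //				return n
+ //			}
+ //			n++
+ //		}
+ //		return n
+ //	}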
+matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm12B + LEAL 1(R10), R10 + +match_nolit_end_encodeSnappyBlockAsm12B: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B + MOVB $0xee, (CX) + MOVW SI, 1(CX) + LEAL -60(R10), R10 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + CMPL SI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + LEAL -15(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: + LEAL -2(DI), DI + MOVB DI, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm12B + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x18, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + SHLQ $0x18, SI + IMULQ R9, SI + SHRQ $0x34, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm12B + INCL DX + JMP search_loop_encodeSnappyBlockAsm12B + +emit_remainder_encodeSnappyBlockAsm12B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm12B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm12B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B + JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B + +three_bytes_emit_remainder_encodeSnappyBlockAsm12B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + +two_bytes_emit_remainder_encodeSnappyBlockAsm12B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + +one_byte_emit_remainder_encodeSnappyBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBlockAsm12B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + +memmove_long_emit_remainder_encodeSnappyBlockAsm12B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeSnappyBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm10B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000020, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm10B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), 
BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBlockAsm10B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm10B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x36, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm10B + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm10B + +repeat_extend_back_loop_encodeSnappyBlockAsm10B: + CMPL DI, SI + JBE repeat_extend_back_end_encodeSnappyBlockAsm10B + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeSnappyBlockAsm10B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B + +repeat_extend_back_end_encodeSnappyBlockAsm10B: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm10B + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B + JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B + +three_bytes_repeat_emit_encodeSnappyBlockAsm10B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + +two_bytes_repeat_emit_encodeSnappyBlockAsm10B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm10B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + +one_byte_repeat_emit_encodeSnappyBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_repeat_emit_encodeSnappyBlockAsm10B: + LEAQ (CX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (CX) + MOVQ R9, -8(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 
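+ // 33..64-byte copy: X0/X1 load the first 32 bytes and X2/X3 the last 32;
+ // the two overlapping store pairs cover any length in the range without a loop.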
+ MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B + +memmove_long_repeat_emit_encodeSnappyBlockAsm10B: + LEAQ (CX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B + JB repeat_extend_forward_end_encodeSnappyBlockAsm10B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B + LEAL 2(R11), R11 + SUBL $0x02, R8 + 
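+ // (the flags from the SUBL above feed the JZ below: a remaining length of
+ // exactly zero ends the repeat-match scan on a 2-byte tail)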
JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeSnappyBlockAsm10B: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + CMPL DI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + LEAL -15(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeSnappyBlockAsm10B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: + LEAL -2(R8), R8 + MOVB R8, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + +repeat_end_emit_encodeSnappyBlockAsm10B: + MOVL DX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm10B + +no_repeat_found_encodeSnappyBlockAsm10B: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm10B + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm10B + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm10B + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBlockAsm10B + +candidate3_match_encodeSnappyBlockAsm10B: + ADDL $0x02, DX + JMP candidate_match_encodeSnappyBlockAsm10B + +candidate2_match_encodeSnappyBlockAsm10B: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm10B + +match_extend_back_loop_encodeSnappyBlockAsm10B: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBlockAsm10B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBlockAsm10B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBlockAsm10B + +match_extend_back_end_encodeSnappyBlockAsm10B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm10B: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm10B + CMPL R8, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm10B + JB three_bytes_match_emit_encodeSnappyBlockAsm10B + +three_bytes_match_emit_encodeSnappyBlockAsm10B: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + +two_bytes_match_emit_encodeSnappyBlockAsm10B: + MOVB $0xf0, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + +one_byte_match_emit_encodeSnappyBlockAsm10B: + SHLB $0x02, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBlockAsm10B: + 
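+ // Literal copy (short path): R9 holds the literal count and R8 is computed
+ // as the dst position just past it; genMemMoveShort below dispatches on
+ // size (<=8, <=16, <=32, <=64 bytes) and CX is advanced to R8 when done.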
LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (CX) + MOVQ DI, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: + MOVQ R8, CX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B + +memmove_long_match_emit_encodeSnappyBlockAsm10B: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_encodeSnappyBlockAsm10B: +match_nolit_loop_encodeSnappyBlockAsm10B: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm10B + 
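+ // GOAMD64_v3 builds use TZCNT (BMI1) in the blocks above and below; the
+ // BSF fallback is equivalent here because the JNZ guard guarantees a
+ // non-zero operand, the only input on which the two instructions differ.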
+matchlen_match8_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B + JB match_nolit_end_encodeSnappyBlockAsm10B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm10B + LEAL 1(R10), R10 + +match_nolit_end_encodeSnappyBlockAsm10B: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B + MOVB $0xee, (CX) + MOVW SI, 1(CX) + LEAL -60(R10), R10 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + CMPL SI, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + LEAL -15(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: + LEAL -2(DI), DI + MOVB DI, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm10B + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm10B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x36, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm10B + INCL DX + JMP search_loop_encodeSnappyBlockAsm10B + +emit_remainder_encodeSnappyBlockAsm10B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm10B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm10B + CMPL DX, $0x00000100 + JB 
two_bytes_emit_remainder_encodeSnappyBlockAsm10B + JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B + +three_bytes_emit_remainder_encodeSnappyBlockAsm10B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + +two_bytes_emit_remainder_encodeSnappyBlockAsm10B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + +one_byte_emit_remainder_encodeSnappyBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBlockAsm10B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + +memmove_long_emit_remainder_encodeSnappyBlockAsm10B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back + 
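+ // genMemMoveLong: the back loop stores 32-byte blocks with aligned MOVOA
+ // (R8 starts at 64-(dst&31), so dst+R8-32 is 32-byte aligned); the
+ // preloaded X0..X3 pairs are stored last to patch the unaligned edges.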
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeSnappyBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBlockAsm8B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000008, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBlockAsm8B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBlockAsm8B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm8B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x38, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x38, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeSnappyBlockAsm8B + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeSnappyBlockAsm8B + +repeat_extend_back_loop_encodeSnappyBlockAsm8B: + CMPL DI, SI + JBE repeat_extend_back_end_encodeSnappyBlockAsm8B + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeSnappyBlockAsm8B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B + +repeat_extend_back_end_encodeSnappyBlockAsm8B: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeSnappyBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_encodeSnappyBlockAsm8B + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B + JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B + +three_bytes_repeat_emit_encodeSnappyBlockAsm8B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + +two_bytes_repeat_emit_encodeSnappyBlockAsm8B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_encodeSnappyBlockAsm8B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + +one_byte_repeat_emit_encodeSnappyBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + 
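+ // emitLiteral headers (Snappy tag 00): lengths up to 60 pack into the tag
+ // byte as (n-1)<<2; 0xf0 (=60<<2) means a one-byte length follows, and
+ // 0xf4 (=61<<2) a two-byte little-endian length, as emitted above.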
+memmove_repeat_emit_encodeSnappyBlockAsm8B: + LEAQ (CX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (R9), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (CX) + MOVQ R9, -8(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B + +memmove_long_repeat_emit_encodeSnappyBlockAsm8B: + LEAQ (CX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + 
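+ // Copy tags used by the emitCopy blocks below: copies longer than 64 bytes
+ // are cut into 60-byte tagCopy2 chunks (0xee = (60-1)<<2|2); 4..11-byte
+ // copies with offset < 2048 use tagCopy1 ((len-4)<<2|1, top 3 offset bits
+ // in the tag); everything else uses tagCopy2 ((len-1)<<2|2 + 16-bit LE
+ // offset). Note the 8B variant drops the offset<2048 test: its blocks are
+ // small enough that every offset fits tagCopy1's 11-bit range.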
+matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B + JB repeat_extend_forward_end_encodeSnappyBlockAsm8B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeSnappyBlockAsm8B: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B + MOVB $0xee, (CX) + MOVW DI, 1(CX) + LEAL -60(SI), SI + ADDQ $0x03, CX + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B + LEAL -15(R8), R8 + MOVB DI, 1(CX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeSnappyBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: + LEAL -2(R8), R8 + MOVB R8, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + +repeat_end_emit_encodeSnappyBlockAsm8B: + MOVL DX, 12(SP) + JMP search_loop_encodeSnappyBlockAsm8B + +no_repeat_found_encodeSnappyBlockAsm8B: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeSnappyBlockAsm8B + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeSnappyBlockAsm8B + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeSnappyBlockAsm8B + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBlockAsm8B + +candidate3_match_encodeSnappyBlockAsm8B: + ADDL $0x02, DX + JMP candidate_match_encodeSnappyBlockAsm8B + +candidate2_match_encodeSnappyBlockAsm8B: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeSnappyBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBlockAsm8B + +match_extend_back_loop_encodeSnappyBlockAsm8B: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBlockAsm8B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBlockAsm8B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBlockAsm8B + +match_extend_back_end_encodeSnappyBlockAsm8B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ 
DI, (SP) + JB match_dst_size_check_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBlockAsm8B: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), R8 + CMPL R8, $0x3c + JB one_byte_match_emit_encodeSnappyBlockAsm8B + CMPL R8, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBlockAsm8B + JB three_bytes_match_emit_encodeSnappyBlockAsm8B + +three_bytes_match_emit_encodeSnappyBlockAsm8B: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + +two_bytes_match_emit_encodeSnappyBlockAsm8B: + MOVB $0xf0, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_match_emit_encodeSnappyBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + +one_byte_match_emit_encodeSnappyBlockAsm8B: + SHLB $0x02, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBlockAsm8B: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: + MOVQ (DI), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (DI), R10 + MOVQ -8(DI)(R9*1), DI + MOVQ R10, (CX) + MOVQ DI, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: + MOVQ R8, CX + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B + +memmove_long_match_emit_encodeSnappyBlockAsm8B: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R9*1), X2 + MOVOU -16(DI)(R9*1), X3 + MOVQ R9, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(DI)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(DI)(R12*1), X4 + MOVOU -16(DI)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ R9, R12 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 
-32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_encodeSnappyBlockAsm8B: +match_nolit_loop_encodeSnappyBlockAsm8B: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_match8_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B + +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B + JB match_nolit_end_encodeSnappyBlockAsm8B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_match1_match_nolit_encodeSnappyBlockAsm8B: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_encodeSnappyBlockAsm8B + LEAL 1(R10), R10 + +match_nolit_end_encodeSnappyBlockAsm8B: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B + MOVB $0xee, (CX) + MOVW SI, 1(CX) + LEAL -60(R10), R10 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B + +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B + LEAL -15(DI), DI + MOVB SI, 1(CX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B + +emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: + LEAL -2(DI), DI + MOVB DI, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBlockAsm8B + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeSnappyBlockAsm8B: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ 
R9, R8 + SHRQ $0x38, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x38, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_encodeSnappyBlockAsm8B + INCL DX + JMP search_loop_encodeSnappyBlockAsm8B + +emit_remainder_encodeSnappyBlockAsm8B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeSnappyBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBlockAsm8B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBlockAsm8B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B + JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B + +three_bytes_emit_remainder_encodeSnappyBlockAsm8B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B + +two_bytes_emit_remainder_encodeSnappyBlockAsm8B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B + +one_byte_emit_remainder_encodeSnappyBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBlockAsm8B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: + MOVQ DX, CX + JMP 
emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + +memmove_long_emit_remainder_encodeSnappyBlockAsm8B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeSnappyBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00001200, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBetterBlockAsm: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JBE check_maxskip_ok_encodeSnappyBetterBlockAsm + LEAL 100(DX), SI + JMP check_maxskip_cont_encodeSnappyBetterBlockAsm + +check_maxskip_ok_encodeSnappyBetterBlockAsm: + LEAL 1(DX)(SI*1), SI + +check_maxskip_cont_encodeSnappyBetterBlockAsm: + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x2f, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL (AX)(R10*4), SI + MOVL 524288(AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, 524288(AX)(R11*4) + MOVQ (BX)(SI*1), R10 + MOVQ (BX)(R8*1), R11 + CMPQ R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm + +no_short_found_encodeSnappyBetterBlockAsm: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBetterBlockAsm + +candidateS_match_encodeSnappyBetterBlockAsm: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x2f, R10 + MOVL (AX)(R10*4), SI + INCL DX + MOVL DX, (AX)(R10*4) + CMPL (BX)(SI*1), 
DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + DECL DX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm + +match_extend_back_loop_encodeSnappyBetterBlockAsm: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm + +match_extend_back_end_encodeSnappyBetterBlockAsm: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 5(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm: + MOVL DX, DI + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), R10 + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm + MOVQ (R9)(R12*1), R11 + MOVQ 8(R9)(R12*1), R13 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm + XORQ 8(R10)(R12*1), R13 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm + LEAL -16(R8), R8 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm + LEAL -8(R8), R8 + LEAL 8(R12), R12 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm + LEAL -4(R8), R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm + JB match_nolit_end_encodeSnappyBetterBlockAsm + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm + LEAL 2(R12), R12 + SUBL $0x02, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm + LEAL 1(R12), R12 + +match_nolit_end_encodeSnappyBetterBlockAsm: + MOVL DX, R8 + SUBL SI, R8 + + // Check if repeat + CMPL R12, $0x01 + JA match_length_ok_encodeSnappyBetterBlockAsm + CMPL R8, $0x0000ffff + JBE match_length_ok_encodeSnappyBetterBlockAsm + MOVL 20(SP), DX + INCL DX + JMP search_loop_encodeSnappyBetterBlockAsm + +match_length_ok_encodeSnappyBetterBlockAsm: + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ 
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_match_emit_encodeSnappyBetterBlockAsm + CMPL SI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm + CMPL SI, $0x00010000 + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm + CMPL SI, $0x01000000 + JB four_bytes_match_emit_encodeSnappyBetterBlockAsm + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x05, CX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +four_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVL SI, R11 + SHRL $0x10, R11 + MOVB $0xf8, (CX) + MOVW SI, 1(CX) + MOVB R11, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +three_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +two_bytes_match_emit_encodeSnappyBetterBlockAsm: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm + +one_byte_match_emit_encodeSnappyBetterBlockAsm: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBetterBlockAsm: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: + MOVQ SI, CX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm + +memmove_long_match_emit_encodeSnappyBetterBlockAsm: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back + 
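+	// The forward loop below stores 32 bytes per iteration with aligned
+	// MOVOA writes: R14 was initialized to 64-(CX&31), so every
+	// destination store is 32-byte aligned. The first and last 32 source
+	// bytes, held in X0-X3 since before the loop, are written afterwards
+	// to cover both unaligned edges of the region.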
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitCopy + CMPL R8, $0x00010000 + JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm + +four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R12, $0x40 + JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0xff, (CX) + MOVL R8, 1(CX) + LEAL -64(R12), R12 + ADDQ $0x05, CX + CMPL R12, $0x04 + JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm + +four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: + TESTL R12, R12 + JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + XORL SI, SI + LEAL -1(SI)(R12*4), R12 + MOVB R12, (CX) + MOVL R8, 1(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R12, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm + MOVB $0xee, (CX) + MOVW R8, 1(CX) + LEAL -60(R12), R12 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: + MOVL R12, SI + SHLL $0x02, SI + CMPL R12, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm + CMPL R8, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm + LEAL -15(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: + LEAL -2(SI), SI + MOVB SI, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + LEAQ 1(DI), DI + LEAQ -2(DX), R9 + MOVQ (BX)(DI*1), R10 + MOVQ 1(BX)(DI*1), R11 + MOVQ (BX)(R9*1), R12 + MOVQ 1(BX)(R9*1), R13 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x2f, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 + MOVL DI, (AX)(R10*4) + MOVL R9, (AX)(R12*4) + MOVL R8, 524288(AX)(R11*4) + MOVL R14, 524288(AX)(R13*4) + LEAQ 1(R9)(DI*1), R8 + SHRQ $0x01, R8 + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm: + CMPQ R8, R9 + JAE search_loop_encodeSnappyBetterBlockAsm + MOVQ (BX)(DI*1), R10 + MOVQ (BX)(R8*1), R11 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x2f, R10 + SHLQ $0x08, R11 + IMULQ SI, R11 + SHRQ $0x2f, R11 + MOVL DI, (AX)(R10*4) + MOVL R8, (AX)(R11*4) + ADDQ $0x02, DI + ADDQ $0x02, R8 + JMP index_loop_encodeSnappyBetterBlockAsm + +emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 5(CX)(AX*1), AX + CMPQ AX, (SP) + JB 
emit_remainder_ok_encodeSnappyBetterBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm + CMPL DX, $0x01000000 + JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm + MOVB $0xfc, (CX) + MOVL DX, 1(CX) + ADDQ $0x05, CX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BL, 3(CX) + ADDQ $0x04, CX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 
16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte, tmp *[294912]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm64K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000900, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm64K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBetterBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBetterBlockAsm64K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + LEAL 1(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm64K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x33, R11 + MOVL (AX)(R10*4), SI + MOVL 262144(AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, 262144(AX)(R11*4) + MOVQ (BX)(SI*1), R10 + MOVQ (BX)(R8*1), R11 + CMPQ R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm64K + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm64K + +no_short_found_encodeSnappyBetterBlockAsm64K: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm64K + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBetterBlockAsm64K + +candidateS_match_encodeSnappyBetterBlockAsm64K: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL (AX)(R10*4), SI + INCL DX + MOVL DX, (AX)(R10*4) + CMPL (BX)(SI*1), DI + JEQ 
candidate_match_encodeSnappyBetterBlockAsm64K + DECL DX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm64K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + +match_extend_back_loop_encodeSnappyBetterBlockAsm64K: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K + +match_extend_back_end_encodeSnappyBetterBlockAsm64K: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm64K: + MOVL DX, DI + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), R10 + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K + MOVQ (R9)(R12*1), R11 + MOVQ 8(R9)(R12*1), R13 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K + XORQ 8(R10)(R12*1), R13 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -16(R8), R8 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -8(R8), R8 + LEAL 8(R12), R12 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -4(R8), R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K + JB match_nolit_end_encodeSnappyBetterBlockAsm64K + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K + LEAL 2(R12), R12 + SUBL $0x02, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm64K + LEAL 1(R12), R12 + +match_nolit_end_encodeSnappyBetterBlockAsm64K: + MOVL DX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB 
one_byte_match_emit_encodeSnappyBetterBlockAsm64K + CMPL SI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K + +three_bytes_match_emit_encodeSnappyBetterBlockAsm64K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm64K + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K + +one_byte_match_emit_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K + +memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: + ADDL R12, DX + ADDL 
$0x04, R12 + MOVL DX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R12, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K + MOVB $0xee, (CX) + MOVW R8, 1(CX) + LEAL -60(R12), R12 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: + MOVL R12, SI + SHLL $0x02, SI + CMPL R12, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + CMPL R8, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -15(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: + LEAL -2(SI), SI + MOVB SI, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm64K + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: + MOVQ $0x00cf1bbcdcbfa563, SI + MOVQ $0x9e3779b1, R8 + LEAQ 1(DI), DI + LEAQ -2(DX), R9 + MOVQ (BX)(DI*1), R10 + MOVQ 1(BX)(DI*1), R11 + MOVQ (BX)(R9*1), R12 + MOVQ 1(BX)(R9*1), R13 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x33, R11 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x30, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x33, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 + MOVL DI, (AX)(R10*4) + MOVL R9, (AX)(R12*4) + MOVL R8, 262144(AX)(R11*4) + MOVL R14, 262144(AX)(R13*4) + LEAQ 1(R9)(DI*1), R8 + SHRQ $0x01, R8 + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm64K: + CMPQ R8, R9 + JAE search_loop_encodeSnappyBetterBlockAsm64K + MOVQ (BX)(DI*1), R10 + MOVQ (BX)(R8*1), R11 + SHLQ $0x08, R10 + IMULQ SI, R10 + SHRQ $0x30, R10 + SHLQ $0x08, R11 + IMULQ SI, R11 + SHRQ $0x30, R11 + MOVL DI, (AX)(R10*4) + MOVL R8, (AX)(R11*4) + ADDQ $0x02, DI + ADDQ $0x02, R8 + JMP index_loop_encodeSnappyBetterBlockAsm64K + +emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm64K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, 
$0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func 
encodeSnappyBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm12B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000280, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm12B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBetterBlockAsm12B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm12B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL (AX)(R10*4), SI + MOVL 65536(AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, 65536(AX)(R11*4) + MOVQ (BX)(SI*1), R10 + MOVQ (BX)(R8*1), R11 + CMPQ R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm12B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm12B + +no_short_found_encodeSnappyBetterBlockAsm12B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm12B + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBetterBlockAsm12B + +candidateS_match_encodeSnappyBetterBlockAsm12B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL (AX)(R10*4), SI + INCL DX + MOVL DX, (AX)(R10*4) + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + DECL DX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + +match_extend_back_loop_encodeSnappyBetterBlockAsm12B: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B + +match_extend_back_end_encodeSnappyBetterBlockAsm12B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm12B: + MOVL DX, DI + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), R10 + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B + MOVQ (R9)(R12*1), R11 + MOVQ 8(R9)(R12*1), R13 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B + XORQ 8(R10)(R12*1), R13 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -16(R8), R8 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B + 
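+	// Taken when the second 8-byte word of a 16-byte matchLen step
+	// differs: the trailing-zero count below locates the mismatching byte
+	// within that word, and 8 is added for the first word, which is
+	// already known to match.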
+matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -8(R8), R8 + LEAL 8(R12), R12 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -4(R8), R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B + JB match_nolit_end_encodeSnappyBetterBlockAsm12B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B + LEAL 2(R12), R12 + SUBL $0x02, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm12B + LEAL 1(R12), R12 + +match_nolit_end_encodeSnappyBetterBlockAsm12B: + MOVL DX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B + CMPL SI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B + +three_bytes_match_emit_encodeSnappyBetterBlockAsm12B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B + +one_byte_match_emit_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP 
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: + MOVQ SI, CX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R12, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B + MOVB $0xee, (CX) + MOVW R8, 1(CX) + LEAL -60(R12), R12 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: + MOVL R12, SI + SHLL $0x02, SI + CMPL R12, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + CMPL R8, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -15(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: + LEAL -2(SI), SI + MOVB SI, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm12B + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + LEAQ 1(DI), DI + LEAQ -2(DX), R9 + MOVQ (BX)(DI*1), R10 + MOVQ 1(BX)(DI*1), R11 + MOVQ (BX)(R9*1), R12 + MOVQ 1(BX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x32, R12 + SHLQ $0x20, R13 + 
IMULQ R8, R13 + SHRQ $0x34, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 + MOVL DI, (AX)(R10*4) + MOVL R9, (AX)(R12*4) + MOVL R8, 65536(AX)(R11*4) + MOVL R14, 65536(AX)(R13*4) + LEAQ 1(R9)(DI*1), R8 + SHRQ $0x01, R8 + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm12B: + CMPQ R8, R9 + JAE search_loop_encodeSnappyBetterBlockAsm12B + MOVQ (BX)(DI*1), R10 + MOVQ (BX)(R8*1), R11 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL DI, (AX)(R10*4) + MOVL R8, (AX)(R11*4) + ADDQ $0x02, DI + ADDQ $0x02, R8 + JMP index_loop_encodeSnappyBetterBlockAsm12B + +emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm12B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + 
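The one_byte/two_bytes/three_bytes branches above implement the Snappy literal header: a length below 60 is stored in the tag byte itself (asm: CMPL $0x3c; SHLB $0x02), tag 0xf0 adds one explicit length byte, and tag 0xf4 adds two, which suffices for these reduced block sizes. A minimal Go sketch of the same encoding, mirroring the reference snappy/s2 emitLiteral (names here are illustrative):

	// emitLiteral writes a Snappy literal element (tag 00) and returns bytes written.
	func emitLiteral(dst, lit []byte) int {
		i, n := 0, len(lit)-1 // lengths are stored biased by -1
		switch {
		case n < 60: // fits in the tag byte (asm: CMPL $0x3c)
			dst[0] = uint8(n) << 2
			i = 1
		case n < 1<<8: // one extra length byte (asm tag 0xf0 = 60<<2)
			dst[0], dst[1] = 60<<2, uint8(n)
			i = 2
		default: // two extra length bytes (asm tag 0xf4 = 61<<2)
			dst[0], dst[1], dst[2] = 61<<2, uint8(n), uint8(n>>8)
			i = 3
		}
		return i + copy(dst[i:], lit)
	}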
+emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm10B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x000000a0, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm10B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBetterBlockAsm10B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm10B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL (AX)(R10*4), SI + MOVL 16384(AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, 16384(AX)(R11*4) + MOVQ (BX)(SI*1), R10 + MOVQ (BX)(R8*1), R11 + CMPQ R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm10B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm10B + 
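The search loop above probes two hash tables per position: a 6-byte hash (the 48-bit IMULQ constant 0x0000cf1bbcdcbf9b, applied after SHLQ $0x10) indexes the long table at the start of tmp, and a 4-byte hash (0x9e3779b1 after SHLQ $0x20) indexes the short table at offset 16384. For this 10B variant the shift counts ($0x34 and $0x36) leave 12 and 10 index bits, i.e. 4096*4 + 1024*4 = 20480 bytes, matching the tmp *[20480]byte parameter. A sketch of the two hash functions, with multipliers taken directly from the IMULQ operands (function names assumed):

	const (
		prime6bytes = 0x0000cf1bbcdcbf9b // 6-byte hash multiplier (long table)
		prime4bytes = 0x9e3779b1         // 4-byte hash multiplier (short table)
	)

	// hash6 hashes the low 6 bytes of u into h bits (asm: SHLQ $0x10; IMULQ; SHRQ).
	func hash6(u uint64, h uint8) uint32 {
		return uint32(((u << 16) * prime6bytes) >> (64 - h))
	}

	// hash4 hashes the low 4 bytes of u into h bits (asm: SHLQ $0x20; IMULQ; SHRQ).
	func hash4(u uint64, h uint8) uint32 {
		return uint32(((u << 32) * prime4bytes) >> (64 - h))
	}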
+no_short_found_encodeSnappyBetterBlockAsm10B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm10B + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBetterBlockAsm10B + +candidateS_match_encodeSnappyBetterBlockAsm10B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL (AX)(R10*4), SI + INCL DX + MOVL DX, (AX)(R10*4) + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + DECL DX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm10B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + +match_extend_back_loop_encodeSnappyBetterBlockAsm10B: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B + +match_extend_back_end_encodeSnappyBetterBlockAsm10B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm10B: + MOVL DX, DI + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), R10 + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B + MOVQ (R9)(R12*1), R11 + MOVQ 8(R9)(R12*1), R13 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B + XORQ 8(R10)(R12*1), R13 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -16(R8), R8 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -8(R8), R8 + LEAL 8(R12), R12 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -4(R8), R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B + JB match_nolit_end_encodeSnappyBetterBlockAsm10B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B + LEAL 2(R12), R12 + SUBL $0x02, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B: + 
MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm10B + LEAL 1(R12), R12 + +match_nolit_end_encodeSnappyBetterBlockAsm10B: + MOVL DX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B + CMPL SI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B + +three_bytes_match_emit_encodeSnappyBetterBlockAsm10B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B + +one_byte_match_emit_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: + MOVQ SI, CX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + 
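The genMemMoveShort blocks above copy a literal of 1..64 bytes without a byte loop: each size class loads the head and the tail of the range with possibly overlapping moves (MOVQ pairs up to 16 bytes, MOVOU pairs up to 32, four MOVOUs up to 64). An illustrative Go version of the *_memmove_move_8through16 case, under the assumption that dst and src do not overlap (true here, since literals are copied from the source buffer into the output buffer):

	import "encoding/binary"

	// move8through16 copies any n in [8,16] bytes with exactly two loads and
	// two stores; the second pair may overlap the first.
	func move8through16(dst, src []byte) {
		n := len(src)
		head := binary.LittleEndian.Uint64(src)       // asm: MOVQ (R10), R11
		tail := binary.LittleEndian.Uint64(src[n-8:]) // asm: MOVQ -8(R10)(R9*1), R10
		binary.LittleEndian.PutUint64(dst, head)
		binary.LittleEndian.PutUint64(dst[n-8:], tail)
	}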
+emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R12, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B + MOVB $0xee, (CX) + MOVW R8, 1(CX) + LEAL -60(R12), R12 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: + MOVL R12, SI + SHLL $0x02, SI + CMPL R12, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + CMPL R8, $0x00000800 + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -15(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: + LEAL -2(SI), SI + MOVB SI, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm10B + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + LEAQ 1(DI), DI + LEAQ -2(DX), R9 + MOVQ (BX)(DI*1), R10 + MOVQ 1(BX)(DI*1), R11 + MOVQ (BX)(R9*1), R12 + MOVQ 1(BX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x34, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x36, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 + MOVL DI, (AX)(R10*4) + MOVL R9, (AX)(R12*4) + MOVL R8, 16384(AX)(R11*4) + MOVL R14, 16384(AX)(R13*4) + LEAQ 1(R9)(DI*1), R8 + SHRQ $0x01, R8 + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm10B: + CMPQ R8, R9 + JAE search_loop_encodeSnappyBetterBlockAsm10B + MOVQ (BX)(DI*1), R10 + MOVQ (BX)(R8*1), R11 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x34, R10 + SHLQ $0x10, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVL DI, (AX)(R10*4) + MOVL R8, (AX)(R11*4) + ADDQ $0x02, DI + ADDQ $0x02, R8 + JMP index_loop_encodeSnappyBetterBlockAsm10B + +emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm10B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP 
memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back + 
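The matchlen_* blocks that recur in each of these functions compute the length of the common prefix of the current and candidate positions, 16 bytes per iteration; on GOAMD64_v3 builds the first differing byte is located with TZCNTQ, otherwise with BSFQ. An equivalent Go sketch, essentially the scalar matchLen logic of the non-assembly s2 path (exact naming assumed):

	import (
		"encoding/binary"
		"math/bits"
	)

	// matchLen returns the number of leading bytes a and b have in common.
	func matchLen(a, b []byte) (n int) {
		for len(a) >= 8 && len(b) >= 8 {
			diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
			if diff != 0 {
				// TZCNTQ/BSFQ + SARQ $0x03 in the assembly: byte index of the
				// first differing bit.
				return n + bits.TrailingZeros64(diff)>>3
			}
			n += 8
			a, b = a[8:], b[8:]
		}
		for i := 0; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
			n++
		}
		return n
	}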
+emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm8B(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000028, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeSnappyBetterBlockAsm8B: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeSnappyBetterBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeSnappyBetterBlockAsm8B: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 1(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm8B + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x38, R11 + MOVL (AX)(R10*4), SI + MOVL 4096(AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, 4096(AX)(R11*4) + MOVQ (BX)(SI*1), R10 + MOVQ (BX)(R8*1), R11 + CMPQ R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm8B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm8B + +no_short_found_encodeSnappyBetterBlockAsm8B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm8B + MOVL 20(SP), DX + JMP search_loop_encodeSnappyBetterBlockAsm8B + +candidateS_match_encodeSnappyBetterBlockAsm8B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL (AX)(R10*4), SI + INCL DX + MOVL DX, (AX)(R10*4) + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + DECL DX + MOVL R8, SI + +candidate_match_encodeSnappyBetterBlockAsm8B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B + +match_extend_back_loop_encodeSnappyBetterBlockAsm8B: + CMPL DX, DI + JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B + +match_extend_back_end_encodeSnappyBetterBlockAsm8B: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeSnappyBetterBlockAsm8B: + MOVL DX, DI + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), 
R10 + + // matchLen + XORL R12, R12 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B + MOVQ (R9)(R12*1), R11 + MOVQ 8(R9)(R12*1), R13 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B + XORQ 8(R10)(R12*1), R13 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -16(R8), R8 + LEAL 16(R12), R12 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x08 + JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B + MOVQ (R9)(R12*1), R11 + XORQ (R10)(R12*1), R11 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -8(R8), R8 + LEAL 8(R12), R12 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x04 + JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -4(R8), R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x01 + JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B + JB match_nolit_end_encodeSnappyBetterBlockAsm8B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B + LEAL 2(R12), R12 + SUBL $0x02, R8 + JZ match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVB (R9)(R12*1), R11 + CMPB (R10)(R12*1), R11 + JNE match_nolit_end_encodeSnappyBetterBlockAsm8B + LEAL 1(R12), R12 + +match_nolit_end_encodeSnappyBetterBlockAsm8B: + MOVL DX, R8 + SUBL SI, R8 + + // Check if repeat + MOVL R8, 16(SP) + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R10 + SUBL SI, R9 + LEAL -1(R9), SI + CMPL SI, $0x3c + JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B + CMPL SI, $0x00000100 + JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B + JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B + +three_bytes_match_emit_encodeSnappyBetterBlockAsm8B: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_match_emit_encodeSnappyBetterBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B + +one_byte_match_emit_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (CX) + ADDQ $0x01, CX + +memmove_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (CX)(R9*1), SI + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ R9, $0x20 + JBE 
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: + MOVQ (R10), R11 + MOVQ R11, (CX) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R10), R11 + MOVQ -8(R10)(R9*1), R10 + MOVQ R11, (CX) + MOVQ R10, -8(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: + MOVQ SI, CX + JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B + +memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: + LEAQ (CX)(R9*1), SI + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R13 + JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ SI, CX + +emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: + ADDL R12, DX + ADDL $0x04, R12 + MOVL DX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R12, $0x40 + JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B + MOVB $0xee, (CX) + MOVW R8, 1(CX) + LEAL -60(R12), R12 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: + MOVL R12, SI + SHLL $0x02, SI + CMPL R12, $0x0c + JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -15(SI), SI + MOVB R8, 1(CX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: + LEAL -2(SI), SI + MOVB SI, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: + CMPL DX, 8(SP) + JAE emit_remainder_encodeSnappyBetterBlockAsm8B + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + 
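The emitCopy blocks just above encode a match in Snappy's two copy forms: tag 01 packs a length of 4..11 and an 11-bit offset into two bytes, tag 10 packs a length of 1..64 and a 16-bit offset into three; matches longer than 64 bytes are first split into 60-byte tag-10 chunks (the 0xee tag byte). Note that this 8B variant omits the offset < $0x00000800 test that the 10B/12B variants perform before choosing the two-byte form, presumably because offsets in this variant cannot reach 2048. A hedged Go sketch of the same encoding:

	import "encoding/binary"

	// emitCopy writes Snappy copy elements for (offset, length) and returns
	// the number of bytes written.
	func emitCopy(dst []byte, offset, length int) int {
		i := 0
		for length > 64 { // asm: CMPL $0x40; MOVB $0xee; LEAL -60(len)
			dst[i] = 59<<2 | 0x02 // 0xee: 60-byte copy with 2-byte offset
			binary.LittleEndian.PutUint16(dst[i+1:], uint16(offset))
			i, length = i+3, length-60
		}
		if length >= 12 || offset >= 2048 { // asm: emit_copy_three
			dst[i] = uint8(length-1)<<2 | 0x02
			binary.LittleEndian.PutUint16(dst[i+1:], uint16(offset))
			return i + 3
		}
		// Two-byte form: 3-bit length-4 and the top 3 offset bits in the tag,
		// low offset byte after (asm: LEAL -15(SI); SHRL $0x08; SHLL $0x05; ORL).
		dst[i] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | 0x01
		dst[i+1] = uint8(offset)
		return i + 2
	}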
+match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ $0x9e3779b1, R8 + LEAQ 1(DI), DI + LEAQ -2(DX), R9 + MOVQ (BX)(DI*1), R10 + MOVQ 1(BX)(DI*1), R11 + MOVQ (BX)(R9*1), R12 + MOVQ 1(BX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x36, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x38, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 + MOVL DI, (AX)(R10*4) + MOVL R9, (AX)(R12*4) + MOVL R8, 4096(AX)(R11*4) + MOVL R14, 4096(AX)(R13*4) + LEAQ 1(R9)(DI*1), R8 + SHRQ $0x01, R8 + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm8B: + CMPQ R8, R9 + JAE search_loop_encodeSnappyBetterBlockAsm8B + MOVQ (BX)(DI*1), R10 + MOVQ (BX)(R8*1), R11 + SHLQ $0x10, R10 + IMULQ SI, R10 + SHRQ $0x36, R10 + SHLQ $0x10, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL DI, (AX)(R10*4) + MOVL R8, (AX)(R11*4) + ADDQ $0x02, DI + ADDQ $0x02, R8 + JMP index_loop_encodeSnappyBetterBlockAsm8B + +emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeSnappyBetterBlockAsm8B: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), DX + CMPL DX, $0x3c + JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B + JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B + +three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVB $0xf4, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVB $0xf0, (CX) + MOVB DL, 1(CX) + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B + +one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (CX) + ADDQ $0x01, CX + +memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL 
SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B + +memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func calcBlockSize(src []byte, tmp *[32768]byte) int +// Requires: BMI, SSE2 +TEXT ·calcBlockSize(SB), $24-40 + MOVQ tmp+24(FP), AX + XORQ CX, CX + MOVQ $0x00000100, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_calcBlockSize: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_calcBlockSize + MOVL $0x00000000, 12(SP) + MOVQ src_len+8(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+0(FP), BX + +search_loop_calcBlockSize: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_calcBlockSize + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x33, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x33, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 
+ IMULQ R9, R10 + SHRQ $0x33, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_calcBlockSize + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_calcBlockSize + +repeat_extend_back_loop_calcBlockSize: + CMPL DI, SI + JBE repeat_extend_back_end_calcBlockSize + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_calcBlockSize + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_calcBlockSize + +repeat_extend_back_end_calcBlockSize: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 5(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_calcBlockSize + MOVQ $0x00000000, ret+32(FP) + RET + +repeat_dst_size_check_calcBlockSize: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_calcBlockSize + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_calcBlockSize + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_calcBlockSize + CMPL SI, $0x00010000 + JB three_bytes_repeat_emit_calcBlockSize + CMPL SI, $0x01000000 + JB four_bytes_repeat_emit_calcBlockSize + ADDQ $0x05, CX + JMP memmove_long_repeat_emit_calcBlockSize + +four_bytes_repeat_emit_calcBlockSize: + ADDQ $0x04, CX + JMP memmove_long_repeat_emit_calcBlockSize + +three_bytes_repeat_emit_calcBlockSize: + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_calcBlockSize + +two_bytes_repeat_emit_calcBlockSize: + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_calcBlockSize + JMP memmove_long_repeat_emit_calcBlockSize + +one_byte_repeat_emit_calcBlockSize: + ADDQ $0x01, CX + +memmove_repeat_emit_calcBlockSize: + LEAQ (CX)(R8*1), CX + JMP emit_literal_done_repeat_emit_calcBlockSize + +memmove_long_repeat_emit_calcBlockSize: + LEAQ (CX)(R8*1), CX + +emit_literal_done_repeat_emit_calcBlockSize: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+8(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_calcBlockSize: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_calcBlockSize + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSize + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_calcBlockSize + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_calcBlockSize + +matchlen_bsf_16repeat_extend_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_calcBlockSize + +matchlen_match8_repeat_extend_calcBlockSize: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_calcBlockSize + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSize + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_calcBlockSize + +matchlen_bsf_8_repeat_extend_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_calcBlockSize + +matchlen_match4_repeat_extend_calcBlockSize: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_calcBlockSize + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_calcBlockSize + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_calcBlockSize: + CMPL R8, $0x01 + JE 
matchlen_match1_repeat_extend_calcBlockSize + JB repeat_extend_forward_end_calcBlockSize + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_calcBlockSize + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_calcBlockSize + +matchlen_match1_repeat_extend_calcBlockSize: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_calcBlockSize + LEAL 1(R11), R11 + +repeat_extend_forward_end_calcBlockSize: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy + CMPL DI, $0x00010000 + JB two_byte_offset_repeat_as_copy_calcBlockSize + +four_bytes_loop_back_repeat_as_copy_calcBlockSize: + CMPL SI, $0x40 + JBE four_bytes_remain_repeat_as_copy_calcBlockSize + LEAL -64(SI), SI + ADDQ $0x05, CX + CMPL SI, $0x04 + JB four_bytes_remain_repeat_as_copy_calcBlockSize + JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize + +four_bytes_remain_repeat_as_copy_calcBlockSize: + TESTL SI, SI + JZ repeat_end_emit_calcBlockSize + XORL SI, SI + ADDQ $0x05, CX + JMP repeat_end_emit_calcBlockSize + +two_byte_offset_repeat_as_copy_calcBlockSize: + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_calcBlockSize + LEAL -60(SI), SI + ADDQ $0x03, CX + JMP two_byte_offset_repeat_as_copy_calcBlockSize + +two_byte_offset_short_repeat_as_copy_calcBlockSize: + MOVL SI, R8 + SHLL $0x02, R8 + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_calcBlockSize + CMPL DI, $0x00000800 + JAE emit_copy_three_repeat_as_copy_calcBlockSize + ADDQ $0x02, CX + JMP repeat_end_emit_calcBlockSize + +emit_copy_three_repeat_as_copy_calcBlockSize: + ADDQ $0x03, CX + +repeat_end_emit_calcBlockSize: + MOVL DX, 12(SP) + JMP search_loop_calcBlockSize + +no_repeat_found_calcBlockSize: + CMPL (BX)(SI*1), DI + JEQ candidate_match_calcBlockSize + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_calcBlockSize + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_calcBlockSize + MOVL 20(SP), DX + JMP search_loop_calcBlockSize + +candidate3_match_calcBlockSize: + ADDL $0x02, DX + JMP candidate_match_calcBlockSize + +candidate2_match_calcBlockSize: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_calcBlockSize: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_calcBlockSize + +match_extend_back_loop_calcBlockSize: + CMPL DX, DI + JBE match_extend_back_end_calcBlockSize + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_calcBlockSize + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_calcBlockSize + JMP match_extend_back_loop_calcBlockSize + +match_extend_back_end_calcBlockSize: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 5(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_calcBlockSize + MOVQ $0x00000000, ret+32(FP) + RET + +match_dst_size_check_calcBlockSize: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_calcBlockSize + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), DI + CMPL DI, $0x3c + JB one_byte_match_emit_calcBlockSize + CMPL DI, $0x00000100 + JB two_bytes_match_emit_calcBlockSize + CMPL DI, $0x00010000 + JB three_bytes_match_emit_calcBlockSize + CMPL DI, $0x01000000 + JB four_bytes_match_emit_calcBlockSize + ADDQ $0x05, CX + JMP memmove_long_match_emit_calcBlockSize + +four_bytes_match_emit_calcBlockSize: + ADDQ $0x04, CX + JMP memmove_long_match_emit_calcBlockSize + +three_bytes_match_emit_calcBlockSize: + ADDQ $0x03, CX + JMP 
memmove_long_match_emit_calcBlockSize + +two_bytes_match_emit_calcBlockSize: + ADDQ $0x02, CX + CMPL DI, $0x40 + JB memmove_match_emit_calcBlockSize + JMP memmove_long_match_emit_calcBlockSize + +one_byte_match_emit_calcBlockSize: + ADDQ $0x01, CX + +memmove_match_emit_calcBlockSize: + LEAQ (CX)(R9*1), CX + JMP emit_literal_done_match_emit_calcBlockSize + +memmove_long_match_emit_calcBlockSize: + LEAQ (CX)(R9*1), CX + +emit_literal_done_match_emit_calcBlockSize: +match_nolit_loop_calcBlockSize: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+8(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_calcBlockSize: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_calcBlockSize + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_calcBlockSize + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_calcBlockSize + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_calcBlockSize + +matchlen_bsf_16match_nolit_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_calcBlockSize + +matchlen_match8_match_nolit_calcBlockSize: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_calcBlockSize + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_calcBlockSize + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_calcBlockSize + +matchlen_bsf_8_match_nolit_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_calcBlockSize + +matchlen_match4_match_nolit_calcBlockSize: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_calcBlockSize + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_calcBlockSize + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_calcBlockSize: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_calcBlockSize + JB match_nolit_end_calcBlockSize + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_calcBlockSize + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_calcBlockSize + +matchlen_match1_match_nolit_calcBlockSize: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_calcBlockSize + LEAL 1(R10), R10 + +match_nolit_end_calcBlockSize: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy + CMPL SI, $0x00010000 + JB two_byte_offset_match_nolit_calcBlockSize + +four_bytes_loop_back_match_nolit_calcBlockSize: + CMPL R10, $0x40 + JBE four_bytes_remain_match_nolit_calcBlockSize + LEAL -64(R10), R10 + ADDQ $0x05, CX + CMPL R10, $0x04 + JB four_bytes_remain_match_nolit_calcBlockSize + JMP four_bytes_loop_back_match_nolit_calcBlockSize + +four_bytes_remain_match_nolit_calcBlockSize: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_calcBlockSize + XORL SI, SI + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_calcBlockSize + +two_byte_offset_match_nolit_calcBlockSize: + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_calcBlockSize + LEAL -60(R10), R10 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_calcBlockSize + +two_byte_offset_short_match_nolit_calcBlockSize: + MOVL R10, DI + SHLL $0x02, DI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_calcBlockSize + CMPL SI, $0x00000800 + JAE emit_copy_three_match_nolit_calcBlockSize + ADDQ $0x02, 
CX + JMP match_nolit_emitcopy_end_calcBlockSize + +emit_copy_three_match_nolit_calcBlockSize: + ADDQ $0x03, CX + +match_nolit_emitcopy_end_calcBlockSize: + CMPL DX, 8(SP) + JAE emit_remainder_calcBlockSize + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_calcBlockSize + MOVQ $0x00000000, ret+32(FP) + RET + +match_nolit_dst_ok_calcBlockSize: + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x33, R8 + SHLQ $0x10, SI + IMULQ R9, SI + SHRQ $0x33, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_calcBlockSize + INCL DX + JMP search_loop_calcBlockSize + +emit_remainder_calcBlockSize: + MOVQ src_len+8(FP), AX + SUBL 12(SP), AX + LEAQ 5(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_calcBlockSize + MOVQ $0x00000000, ret+32(FP) + RET + +emit_remainder_ok_calcBlockSize: + MOVQ src_len+8(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_calcBlockSize + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), AX + CMPL AX, $0x3c + JB one_byte_emit_remainder_calcBlockSize + CMPL AX, $0x00000100 + JB two_bytes_emit_remainder_calcBlockSize + CMPL AX, $0x00010000 + JB three_bytes_emit_remainder_calcBlockSize + CMPL AX, $0x01000000 + JB four_bytes_emit_remainder_calcBlockSize + ADDQ $0x05, CX + JMP memmove_long_emit_remainder_calcBlockSize + +four_bytes_emit_remainder_calcBlockSize: + ADDQ $0x04, CX + JMP memmove_long_emit_remainder_calcBlockSize + +three_bytes_emit_remainder_calcBlockSize: + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_calcBlockSize + +two_bytes_emit_remainder_calcBlockSize: + ADDQ $0x02, CX + CMPL AX, $0x40 + JB memmove_emit_remainder_calcBlockSize + JMP memmove_long_emit_remainder_calcBlockSize + +one_byte_emit_remainder_calcBlockSize: + ADDQ $0x01, CX + +memmove_emit_remainder_calcBlockSize: + LEAQ (CX)(SI*1), AX + MOVQ AX, CX + JMP emit_literal_done_emit_remainder_calcBlockSize + +memmove_long_emit_remainder_calcBlockSize: + LEAQ (CX)(SI*1), AX + MOVQ AX, CX + +emit_literal_done_emit_remainder_calcBlockSize: + MOVQ CX, ret+32(FP) + RET + +// func calcBlockSizeSmall(src []byte, tmp *[2048]byte) int +// Requires: BMI, SSE2 +TEXT ·calcBlockSizeSmall(SB), $24-40 + MOVQ tmp+24(FP), AX + XORQ CX, CX + MOVQ $0x00000010, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_calcBlockSizeSmall: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_calcBlockSizeSmall + MOVL $0x00000000, 12(SP) + MOVQ src_len+8(FP), DX + LEAQ -9(DX), BX + LEAQ -8(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+0(FP), BX + +search_loop_calcBlockSizeSmall: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_calcBlockSizeSmall + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x37, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x37, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + LEAL 1(DX), R10 + MOVL R10, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x37, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ 
DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_calcBlockSizeSmall + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_calcBlockSizeSmall + +repeat_extend_back_loop_calcBlockSizeSmall: + CMPL DI, SI + JBE repeat_extend_back_end_calcBlockSizeSmall + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_calcBlockSizeSmall + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_calcBlockSizeSmall + +repeat_extend_back_end_calcBlockSizeSmall: + MOVL DI, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_calcBlockSizeSmall + MOVQ $0x00000000, ret+32(FP) + RET + +repeat_dst_size_check_calcBlockSizeSmall: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (BX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JB one_byte_repeat_emit_calcBlockSizeSmall + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_calcBlockSizeSmall + JB three_bytes_repeat_emit_calcBlockSizeSmall + +three_bytes_repeat_emit_calcBlockSizeSmall: + ADDQ $0x03, CX + JMP memmove_long_repeat_emit_calcBlockSizeSmall + +two_bytes_repeat_emit_calcBlockSizeSmall: + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_repeat_emit_calcBlockSizeSmall + JMP memmove_long_repeat_emit_calcBlockSizeSmall + +one_byte_repeat_emit_calcBlockSizeSmall: + ADDQ $0x01, CX + +memmove_repeat_emit_calcBlockSizeSmall: + LEAQ (CX)(R8*1), CX + JMP emit_literal_done_repeat_emit_calcBlockSizeSmall + +memmove_long_repeat_emit_calcBlockSizeSmall: + LEAQ (CX)(R8*1), CX + +emit_literal_done_repeat_emit_calcBlockSizeSmall: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+8(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + +matchlen_loopback_16_repeat_extend_calcBlockSizeSmall: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_calcBlockSizeSmall + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall + +matchlen_bsf_16repeat_extend_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_calcBlockSizeSmall + +matchlen_match8_repeat_extend_calcBlockSizeSmall: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_calcBlockSizeSmall + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_calcBlockSizeSmall + +matchlen_bsf_8_repeat_extend_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_calcBlockSizeSmall + +matchlen_match4_repeat_extend_calcBlockSizeSmall: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_calcBlockSizeSmall + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_calcBlockSizeSmall + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_calcBlockSizeSmall: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_calcBlockSizeSmall + JB repeat_extend_forward_end_calcBlockSizeSmall + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE 
matchlen_match1_repeat_extend_calcBlockSizeSmall + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_calcBlockSizeSmall + +matchlen_match1_repeat_extend_calcBlockSizeSmall: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_calcBlockSizeSmall + LEAL 1(R11), R11 + +repeat_extend_forward_end_calcBlockSizeSmall: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitCopy +two_byte_offset_repeat_as_copy_calcBlockSizeSmall: + CMPL SI, $0x40 + JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall + LEAL -60(SI), SI + ADDQ $0x03, CX + JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall + +two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall: + MOVL SI, DI + SHLL $0x02, DI + CMPL SI, $0x0c + JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall + ADDQ $0x02, CX + JMP repeat_end_emit_calcBlockSizeSmall + +emit_copy_three_repeat_as_copy_calcBlockSizeSmall: + ADDQ $0x03, CX + +repeat_end_emit_calcBlockSizeSmall: + MOVL DX, 12(SP) + JMP search_loop_calcBlockSizeSmall + +no_repeat_found_calcBlockSizeSmall: + CMPL (BX)(SI*1), DI + JEQ candidate_match_calcBlockSizeSmall + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_calcBlockSizeSmall + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_calcBlockSizeSmall + MOVL 20(SP), DX + JMP search_loop_calcBlockSizeSmall + +candidate3_match_calcBlockSizeSmall: + ADDL $0x02, DX + JMP candidate_match_calcBlockSizeSmall + +candidate2_match_calcBlockSizeSmall: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_calcBlockSizeSmall: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_calcBlockSizeSmall + +match_extend_back_loop_calcBlockSizeSmall: + CMPL DX, DI + JBE match_extend_back_end_calcBlockSizeSmall + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_calcBlockSizeSmall + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_calcBlockSizeSmall + JMP match_extend_back_loop_calcBlockSizeSmall + +match_extend_back_end_calcBlockSizeSmall: + MOVL DX, DI + SUBL 12(SP), DI + LEAQ 3(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_calcBlockSizeSmall + MOVQ $0x00000000, ret+32(FP) + RET + +match_dst_size_check_calcBlockSizeSmall: + MOVL DX, DI + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_calcBlockSizeSmall + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (BX)(R8*1), DI + SUBL R8, R9 + LEAL -1(R9), DI + CMPL DI, $0x3c + JB one_byte_match_emit_calcBlockSizeSmall + CMPL DI, $0x00000100 + JB two_bytes_match_emit_calcBlockSizeSmall + JB three_bytes_match_emit_calcBlockSizeSmall + +three_bytes_match_emit_calcBlockSizeSmall: + ADDQ $0x03, CX + JMP memmove_long_match_emit_calcBlockSizeSmall + +two_bytes_match_emit_calcBlockSizeSmall: + ADDQ $0x02, CX + CMPL DI, $0x40 + JB memmove_match_emit_calcBlockSizeSmall + JMP memmove_long_match_emit_calcBlockSizeSmall + +one_byte_match_emit_calcBlockSizeSmall: + ADDQ $0x01, CX + +memmove_match_emit_calcBlockSizeSmall: + LEAQ (CX)(R9*1), CX + JMP emit_literal_done_match_emit_calcBlockSizeSmall + +memmove_long_match_emit_calcBlockSizeSmall: + LEAQ (CX)(R9*1), CX + +emit_literal_done_match_emit_calcBlockSizeSmall: +match_nolit_loop_calcBlockSizeSmall: + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+8(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R10, R10 + +matchlen_loopback_16_match_nolit_calcBlockSizeSmall: + CMPL DI, 
$0x10 + JB matchlen_match8_match_nolit_calcBlockSizeSmall + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall + +matchlen_bsf_16match_nolit_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP match_nolit_end_calcBlockSizeSmall + +matchlen_match8_match_nolit_calcBlockSizeSmall: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_calcBlockSizeSmall + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_match_nolit_calcBlockSizeSmall + +matchlen_bsf_8_match_nolit_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_calcBlockSizeSmall + +matchlen_match4_match_nolit_calcBlockSizeSmall: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_calcBlockSizeSmall + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_calcBlockSizeSmall + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_calcBlockSizeSmall: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_calcBlockSizeSmall + JB match_nolit_end_calcBlockSizeSmall + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_calcBlockSizeSmall + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ match_nolit_end_calcBlockSizeSmall + +matchlen_match1_match_nolit_calcBlockSizeSmall: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE match_nolit_end_calcBlockSizeSmall + LEAL 1(R10), R10 + +match_nolit_end_calcBlockSizeSmall: + ADDL R10, DX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL DX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_calcBlockSizeSmall: + CMPL R10, $0x40 + JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall + LEAL -60(R10), R10 + ADDQ $0x03, CX + JMP two_byte_offset_match_nolit_calcBlockSizeSmall + +two_byte_offset_short_match_nolit_calcBlockSizeSmall: + MOVL R10, SI + SHLL $0x02, SI + CMPL R10, $0x0c + JAE emit_copy_three_match_nolit_calcBlockSizeSmall + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_calcBlockSizeSmall + +emit_copy_three_match_nolit_calcBlockSizeSmall: + ADDQ $0x03, CX + +match_nolit_emitcopy_end_calcBlockSizeSmall: + CMPL DX, 8(SP) + JAE emit_remainder_calcBlockSizeSmall + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_calcBlockSizeSmall + MOVQ $0x00000000, ret+32(FP) + RET + +match_nolit_dst_ok_calcBlockSizeSmall: + MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, SI + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x37, R8 + SHLQ $0x20, SI + IMULQ R9, SI + SHRQ $0x37, SI + LEAL -2(DX), R9 + LEAQ (AX)(SI*4), R10 + MOVL (R10), SI + MOVL R9, (AX)(R8*4) + MOVL DX, (R10) + CMPL (BX)(SI*1), DI + JEQ match_nolit_loop_calcBlockSizeSmall + INCL DX + JMP search_loop_calcBlockSizeSmall + +emit_remainder_calcBlockSizeSmall: + MOVQ src_len+8(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_calcBlockSizeSmall + MOVQ $0x00000000, ret+32(FP) + RET + +emit_remainder_ok_calcBlockSizeSmall: + MOVQ src_len+8(FP), AX + MOVL 12(SP), DX + CMPL DX, AX + JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (BX)(DX*1), AX + SUBL DX, SI + LEAL -1(SI), AX + CMPL AX, $0x3c + 
JB one_byte_emit_remainder_calcBlockSizeSmall + CMPL AX, $0x00000100 + JB two_bytes_emit_remainder_calcBlockSizeSmall + JB three_bytes_emit_remainder_calcBlockSizeSmall + +three_bytes_emit_remainder_calcBlockSizeSmall: + ADDQ $0x03, CX + JMP memmove_long_emit_remainder_calcBlockSizeSmall + +two_bytes_emit_remainder_calcBlockSizeSmall: + ADDQ $0x02, CX + CMPL AX, $0x40 + JB memmove_emit_remainder_calcBlockSizeSmall + JMP memmove_long_emit_remainder_calcBlockSizeSmall + +one_byte_emit_remainder_calcBlockSizeSmall: + ADDQ $0x01, CX + +memmove_emit_remainder_calcBlockSizeSmall: + LEAQ (CX)(SI*1), AX + MOVQ AX, CX + JMP emit_literal_done_emit_remainder_calcBlockSizeSmall + +memmove_long_emit_remainder_calcBlockSizeSmall: + LEAQ (CX)(SI*1), AX + MOVQ AX, CX + +emit_literal_done_emit_remainder_calcBlockSizeSmall: + MOVQ CX, ret+32(FP) + RET + +// func emitLiteral(dst []byte, lit []byte) int +// Requires: SSE2 +TEXT ·emitLiteral(SB), NOSPLIT, $0-56 + MOVQ lit_len+32(FP), DX + MOVQ dst_base+0(FP), AX + MOVQ lit_base+24(FP), CX + TESTQ DX, DX + JZ emit_literal_end_standalone_skip + MOVL DX, BX + LEAL -1(DX), SI + CMPL SI, $0x3c + JB one_byte_standalone + CMPL SI, $0x00000100 + JB two_bytes_standalone + CMPL SI, $0x00010000 + JB three_bytes_standalone + CMPL SI, $0x01000000 + JB four_bytes_standalone + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP memmove_long_standalone + +four_bytes_standalone: + MOVL SI, DI + SHRL $0x10, DI + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB DI, 3(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP memmove_long_standalone + +three_bytes_standalone: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP memmove_long_standalone + +two_bytes_standalone: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + CMPL SI, $0x40 + JB memmove_standalone + JMP memmove_long_standalone + +one_byte_standalone: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, BX + ADDQ $0x01, AX + +memmove_standalone: + // genMemMoveShort + CMPQ DX, $0x03 + JB emit_lit_memmove_standalone_memmove_move_1or2 + JE emit_lit_memmove_standalone_memmove_move_3 + CMPQ DX, $0x08 + JB emit_lit_memmove_standalone_memmove_move_4through7 + CMPQ DX, $0x10 + JBE emit_lit_memmove_standalone_memmove_move_8through16 + CMPQ DX, $0x20 + JBE emit_lit_memmove_standalone_memmove_move_17through32 + JMP emit_lit_memmove_standalone_memmove_move_33through64 + +emit_lit_memmove_standalone_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(DX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(DX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(DX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + 
+memmove_long_standalone: + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVQ DX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_standalonelarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_standalonelarge_big_loop_back + +emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ DX, R8 + JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +emit_literal_end_standalone_skip: + XORQ BX, BX + +emit_literal_end_standalone: + MOVQ BX, ret+48(FP) + RET + +// func emitRepeat(dst []byte, offset int, length int) int +TEXT ·emitRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitRepeat +emit_repeat_again_standalone: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JBE repeat_two_standalone + CMPL SI, $0x0c + JAE cant_repeat_two_offset_standalone + CMPL CX, $0x00000800 + JB repeat_two_offset_standalone + +cant_repeat_two_offset_standalone: + CMPL DX, $0x00000104 + JB repeat_three_standalone + CMPL DX, $0x00010100 + JB repeat_four_standalone + CMPL DX, $0x0100ffff + JB repeat_five_standalone + LEAL -16842747(DX), DX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone + +repeat_five_standalone: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_repeat_end + +repeat_four_standalone: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_repeat_end + +repeat_three_standalone: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_repeat_end + +repeat_two_standalone: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_repeat_end + +repeat_two_offset_standalone: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + +gen_emit_repeat_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopy(dst []byte, offset int, length int) int +TEXT ·emitCopy(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitCopy + CMPL CX, $0x00010000 + JB two_byte_offset_standalone + CMPL DX, $0x40 + JBE four_bytes_remain_standalone + MOVB $0xff, (AX) + MOVL CX, 1(AX) + LEAL -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JB four_bytes_remain_standalone + + // emitRepeat +emit_repeat_again_standalone_emit_copy: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JBE repeat_two_standalone_emit_copy + CMPL SI, $0x0c + JAE cant_repeat_two_offset_standalone_emit_copy + CMPL CX, $0x00000800 + JB repeat_two_offset_standalone_emit_copy + +cant_repeat_two_offset_standalone_emit_copy: + CMPL DX, $0x00000104 + 
JB repeat_three_standalone_emit_copy + CMPL DX, $0x00010100 + JB repeat_four_standalone_emit_copy + CMPL DX, $0x0100ffff + JB repeat_five_standalone_emit_copy + LEAL -16842747(DX), DX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy + +repeat_five_standalone_emit_copy: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +four_bytes_remain_standalone: + TESTL DX, DX + JZ gen_emit_copy_end + XORL SI, SI + LEAL -1(SI)(DX*4), DX + MOVB DL, (AX) + MOVL CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +two_byte_offset_standalone: + CMPL DX, $0x40 + JBE two_byte_offset_short_standalone + CMPL CX, $0x00000800 + JAE long_offset_short_standalone + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB CL, 1(AX) + MOVL CX, DI + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + SUBL $0x08, DX + + // emitRepeat + LEAL -4(DX), DX + JMP cant_repeat_two_offset_standalone_emit_copy_short_2b + +emit_repeat_again_standalone_emit_copy_short_2b: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JBE repeat_two_standalone_emit_copy_short_2b + CMPL SI, $0x0c + JAE cant_repeat_two_offset_standalone_emit_copy_short_2b + CMPL CX, $0x00000800 + JB repeat_two_offset_standalone_emit_copy_short_2b + +cant_repeat_two_offset_standalone_emit_copy_short_2b: + CMPL DX, $0x00000104 + JB repeat_three_standalone_emit_copy_short_2b + CMPL DX, $0x00010100 + JB repeat_four_standalone_emit_copy_short_2b + CMPL DX, $0x0100ffff + JB repeat_five_standalone_emit_copy_short_2b + LEAL -16842747(DX), DX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy_short_2b + +repeat_five_standalone_emit_copy_short_2b: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy_short_2b: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy_short_2b: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy_short_2b: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +long_offset_short_standalone: + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAL -60(DX), DX + ADDQ $0x03, AX + ADDQ 
$0x03, BX + + // emitRepeat +emit_repeat_again_standalone_emit_copy_short: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JBE repeat_two_standalone_emit_copy_short + CMPL SI, $0x0c + JAE cant_repeat_two_offset_standalone_emit_copy_short + CMPL CX, $0x00000800 + JB repeat_two_offset_standalone_emit_copy_short + +cant_repeat_two_offset_standalone_emit_copy_short: + CMPL DX, $0x00000104 + JB repeat_three_standalone_emit_copy_short + CMPL DX, $0x00010100 + JB repeat_four_standalone_emit_copy_short + CMPL DX, $0x0100ffff + JB repeat_five_standalone_emit_copy_short + LEAL -16842747(DX), DX + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy_short + +repeat_five_standalone_emit_copy_short: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy_short: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy_short: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy_short: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +two_byte_offset_short_standalone: + MOVL DX, SI + SHLL $0x02, SI + CMPL DX, $0x0c + JAE emit_copy_three_standalone + CMPL CX, $0x00000800 + JAE emit_copy_three_standalone + LEAL -15(SI), SI + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, SI + MOVB SI, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +emit_copy_three_standalone: + LEAL -2(SI), SI + MOVB SI, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopyNoRepeat(dst []byte, offset int, length int) int +TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitCopy + CMPL CX, $0x00010000 + JB two_byte_offset_standalone_snappy + +four_bytes_loop_back_standalone_snappy: + CMPL DX, $0x40 + JBE four_bytes_remain_standalone_snappy + MOVB $0xff, (AX) + MOVL CX, 1(AX) + LEAL -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JB four_bytes_remain_standalone_snappy + JMP four_bytes_loop_back_standalone_snappy + +four_bytes_remain_standalone_snappy: + TESTL DX, DX + JZ gen_emit_copy_end_snappy + XORL SI, SI + LEAL -1(SI)(DX*4), DX + MOVB DL, (AX) + MOVL CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end_snappy + +two_byte_offset_standalone_snappy: + CMPL DX, $0x40 + JBE two_byte_offset_short_standalone_snappy + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAL -60(DX), DX + ADDQ $0x03, AX + ADDQ $0x03, BX + JMP two_byte_offset_standalone_snappy + +two_byte_offset_short_standalone_snappy: + MOVL DX, SI + SHLL $0x02, SI + CMPL DX, $0x0c + JAE emit_copy_three_standalone_snappy + CMPL CX, $0x00000800 + JAE emit_copy_three_standalone_snappy + LEAL -15(SI), SI + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, SI + MOVB SI, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end_snappy + +emit_copy_three_standalone_snappy: + LEAL 
-2(SI), SI + MOVB SI, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end_snappy: + MOVQ BX, ret+40(FP) + RET + +// func matchLen(a []byte, b []byte) int +// Requires: BMI +TEXT ·matchLen(SB), NOSPLIT, $0-56 + MOVQ a_base+0(FP), AX + MOVQ b_base+24(FP), CX + MOVQ a_len+8(FP), DX + + // matchLen + XORL SI, SI + +matchlen_loopback_16_standalone: + CMPL DX, $0x10 + JB matchlen_match8_standalone + MOVQ (AX)(SI*1), BX + MOVQ 8(AX)(SI*1), DI + XORQ (CX)(SI*1), BX + JNZ matchlen_bsf_8_standalone + XORQ 8(CX)(SI*1), DI + JNZ matchlen_bsf_16standalone + LEAL -16(DX), DX + LEAL 16(SI), SI + JMP matchlen_loopback_16_standalone + +matchlen_bsf_16standalone: +#ifdef GOAMD64_v3 + TZCNTQ DI, DI + +#else + BSFQ DI, DI + +#endif + SARQ $0x03, DI + LEAL 8(SI)(DI*1), SI + JMP gen_match_len_end + +matchlen_match8_standalone: + CMPL DX, $0x08 + JB matchlen_match4_standalone + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX + JNZ matchlen_bsf_8_standalone + LEAL -8(DX), DX + LEAL 8(SI), SI + JMP matchlen_match4_standalone + +matchlen_bsf_8_standalone: +#ifdef GOAMD64_v3 + TZCNTQ BX, BX + +#else + BSFQ BX, BX + +#endif + SARQ $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end + +matchlen_match4_standalone: + CMPL DX, $0x04 + JB matchlen_match2_standalone + MOVL (AX)(SI*1), BX + CMPL (CX)(SI*1), BX + JNE matchlen_match2_standalone + LEAL -4(DX), DX + LEAL 4(SI), SI + +matchlen_match2_standalone: + CMPL DX, $0x01 + JE matchlen_match1_standalone + JB gen_match_len_end + MOVW (AX)(SI*1), BX + CMPW (CX)(SI*1), BX + JNE matchlen_match1_standalone + LEAL 2(SI), SI + SUBL $0x02, DX + JZ gen_match_len_end + +matchlen_match1_standalone: + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL + JNE gen_match_len_end + LEAL 1(SI), SI + +gen_match_len_end: + MOVQ SI, ret+48(FP) + RET + +// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -8(AX)(CX*1), CX + XORQ DI, DI + +lz4_s2_loop: + CMPQ DX, BX + JAE lz4_s2_corrupt + CMPQ AX, CX + JAE lz4_s2_dstfull + MOVBQZX (DX), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x04, R9 + ANDQ $0x0f, R10 + CMPQ R8, $0xf0 + JB lz4_s2_ll_end + +lz4_s2_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4_s2_corrupt + MOVBQZX (DX), R8 + ADDQ R8, R9 + CMPQ R8, $0xff + JEQ lz4_s2_ll_loop + +lz4_s2_ll_end: + LEAQ (DX)(R9*1), R8 + ADDQ $0x04, R10 + CMPQ R8, BX + JAE lz4_s2_corrupt + INCQ DX + INCQ R8 + TESTQ R9, R9 + JZ lz4_s2_lits_done + LEAQ (AX)(R9*1), R11 + CMPQ R11, CX + JAE lz4_s2_dstfull + ADDQ R9, SI + LEAL -1(R9), R11 + CMPL R11, $0x3c + JB one_byte_lz4_s2 + CMPL R11, $0x00000100 + JB two_bytes_lz4_s2 + CMPL R11, $0x00010000 + JB three_bytes_lz4_s2 + CMPL R11, $0x01000000 + JB four_bytes_lz4_s2 + MOVB $0xfc, (AX) + MOVL R11, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4_s2 + +four_bytes_lz4_s2: + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (AX) + MOVW R11, 1(AX) + MOVB R12, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4_s2 + +three_bytes_lz4_s2: + MOVB $0xf4, (AX) + MOVW R11, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4_s2 + +two_bytes_lz4_s2: + MOVB $0xf0, (AX) + MOVB R11, 1(AX) + ADDQ $0x02, AX + CMPL R11, $0x40 + JB memmove_lz4_s2 + JMP memmove_long_lz4_s2 + +one_byte_lz4_s2: + SHLB $0x02, R11 + MOVB R11, (AX) + ADDQ $0x01, AX + +memmove_lz4_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_lz4_s2_memmove_move_8 + 
CMPQ R9, $0x10 + JBE emit_lit_memmove_lz4_s2_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_lz4_s2_memmove_move_17through32 + JMP emit_lit_memmove_lz4_s2_memmove_move_33through64 + +emit_lit_memmove_lz4_s2_memmove_move_8: + MOVQ (DX), R12 + MOVQ R12, (AX) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_8through16: + MOVQ (DX), R12 + MOVQ -8(DX)(R9*1), DX + MOVQ R12, (AX) + MOVQ DX, -8(AX)(R9*1) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_lz4_s2: + MOVQ R11, AX + JMP lz4_s2_lits_emit_done + +memmove_long_lz4_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R12 + ANDL $0x0000001f, R12 + MOVQ $0x00000040, R14 + SUBQ R12, R14 + DECQ R13 + JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 + LEAQ -32(DX)(R14*1), R12 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_lz4_s2large_big_loop_back: + MOVOU (R12), X4 + MOVOU 16(R12), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R12 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_lz4_s2large_big_loop_back + +emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32: + MOVOU -32(DX)(R14*1), X4 + MOVOU -16(DX)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R11, AX + +lz4_s2_lits_emit_done: + MOVQ R8, DX + +lz4_s2_lits_done: + CMPQ DX, BX + JNE lz4_s2_match + CMPQ R10, $0x04 + JEQ lz4_s2_done + JMP lz4_s2_corrupt + +lz4_s2_match: + LEAQ 2(DX), R8 + CMPQ R8, BX + JAE lz4_s2_corrupt + MOVWQZX (DX), R9 + MOVQ R8, DX + TESTQ R9, R9 + JZ lz4_s2_corrupt + CMPQ R9, SI + JA lz4_s2_corrupt + CMPQ R10, $0x13 + JNE lz4_s2_ml_done + +lz4_s2_ml_loop: + MOVBQZX (DX), R8 + INCQ DX + ADDQ R8, R10 + CMPQ DX, BX + JAE lz4_s2_corrupt + CMPQ R8, $0xff + JEQ lz4_s2_ml_loop + +lz4_s2_ml_done: + ADDQ R10, SI + CMPQ R9, DI + JNE lz4_s2_docopy + + // emitRepeat +emit_repeat_again_lz4_s2: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2 + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2 + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2 + +cant_repeat_two_offset_lz4_s2: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2 + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2 + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2 + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2 + +repeat_five_lz4_s2: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4_s2_loop + +repeat_four_lz4_s2: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + 
+repeat_two_offset_lz4_s2: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +lz4_s2_docopy: + MOVQ R9, DI + + // emitCopy + CMPL R10, $0x40 + JBE two_byte_offset_short_lz4_s2 + CMPL R9, $0x00000800 + JAE long_offset_short_lz4_s2 + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB R9, 1(AX) + MOVL R9, R11 + SHRL $0x08, R11 + SHLL $0x05, R11 + ORL R11, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + +emit_repeat_again_lz4_s2_emit_copy_short_2b: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2_emit_copy_short_2b + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2_emit_copy_short_2b + +cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2_emit_copy_short_2b + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2_emit_copy_short_2b + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2_emit_copy_short_2b + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short_2b + +repeat_five_lz4_s2_emit_copy_short_2b: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4_s2_loop + +repeat_four_lz4_s2_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +long_offset_short_lz4_s2: + MOVB $0xee, (AX) + MOVW R9, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_lz4_s2_emit_copy_short: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2_emit_copy_short + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2_emit_copy_short + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2_emit_copy_short + +cant_repeat_two_offset_lz4_s2_emit_copy_short: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2_emit_copy_short + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2_emit_copy_short + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2_emit_copy_short + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short + +repeat_five_lz4_s2_emit_copy_short: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4_s2_loop + +repeat_four_lz4_s2_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + 
SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +two_byte_offset_short_lz4_s2: + MOVL R10, R8 + SHLL $0x02, R8 + CMPL R10, $0x0c + JAE emit_copy_three_lz4_s2 + CMPL R9, $0x00000800 + JAE emit_copy_three_lz4_s2 + LEAL -15(R8), R8 + MOVB R9, 1(AX) + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +emit_copy_three_lz4_s2: + LEAL -2(R8), R8 + MOVB R8, (AX) + MOVW R9, 1(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +lz4_s2_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4_s2_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4_s2_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -8(AX)(CX*1), CX + XORQ DI, DI + +lz4s_s2_loop: + CMPQ DX, BX + JAE lz4s_s2_corrupt + CMPQ AX, CX + JAE lz4s_s2_dstfull + MOVBQZX (DX), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x04, R9 + ANDQ $0x0f, R10 + CMPQ R8, $0xf0 + JB lz4s_s2_ll_end + +lz4s_s2_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4s_s2_corrupt + MOVBQZX (DX), R8 + ADDQ R8, R9 + CMPQ R8, $0xff + JEQ lz4s_s2_ll_loop + +lz4s_s2_ll_end: + LEAQ (DX)(R9*1), R8 + ADDQ $0x03, R10 + CMPQ R8, BX + JAE lz4s_s2_corrupt + INCQ DX + INCQ R8 + TESTQ R9, R9 + JZ lz4s_s2_lits_done + LEAQ (AX)(R9*1), R11 + CMPQ R11, CX + JAE lz4s_s2_dstfull + ADDQ R9, SI + LEAL -1(R9), R11 + CMPL R11, $0x3c + JB one_byte_lz4s_s2 + CMPL R11, $0x00000100 + JB two_bytes_lz4s_s2 + CMPL R11, $0x00010000 + JB three_bytes_lz4s_s2 + CMPL R11, $0x01000000 + JB four_bytes_lz4s_s2 + MOVB $0xfc, (AX) + MOVL R11, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4s_s2 + +four_bytes_lz4s_s2: + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (AX) + MOVW R11, 1(AX) + MOVB R12, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4s_s2 + +three_bytes_lz4s_s2: + MOVB $0xf4, (AX) + MOVW R11, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4s_s2 + +two_bytes_lz4s_s2: + MOVB $0xf0, (AX) + MOVB R11, 1(AX) + ADDQ $0x02, AX + CMPL R11, $0x40 + JB memmove_lz4s_s2 + JMP memmove_long_lz4s_s2 + +one_byte_lz4s_s2: + SHLB $0x02, R11 + MOVB R11, (AX) + ADDQ $0x01, AX + +memmove_lz4s_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveShort + CMPQ R9, $0x08 + JBE emit_lit_memmove_lz4s_s2_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32 + JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64 + +emit_lit_memmove_lz4s_s2_memmove_move_8: + MOVQ (DX), R12 + MOVQ R12, (AX) + JMP memmove_end_copy_lz4s_s2 + +emit_lit_memmove_lz4s_s2_memmove_move_8through16: + MOVQ (DX), R12 + MOVQ -8(DX)(R9*1), DX + MOVQ R12, (AX) + MOVQ DX, -8(AX)(R9*1) + JMP memmove_end_copy_lz4s_s2 + +emit_lit_memmove_lz4s_s2_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_lz4s_s2 + +emit_lit_memmove_lz4s_s2_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_lz4s_s2: + MOVQ R11, AX + JMP lz4s_s2_lits_emit_done + +memmove_long_lz4s_s2: + LEAQ 
(AX)(R9*1), R11 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R12 + ANDL $0x0000001f, R12 + MOVQ $0x00000040, R14 + SUBQ R12, R14 + DECQ R13 + JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 + LEAQ -32(DX)(R14*1), R12 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_lz4s_s2large_big_loop_back: + MOVOU (R12), X4 + MOVOU 16(R12), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R12 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back + +emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32: + MOVOU -32(DX)(R14*1), X4 + MOVOU -16(DX)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R11, AX + +lz4s_s2_lits_emit_done: + MOVQ R8, DX + +lz4s_s2_lits_done: + CMPQ DX, BX + JNE lz4s_s2_match + CMPQ R10, $0x03 + JEQ lz4s_s2_done + JMP lz4s_s2_corrupt + +lz4s_s2_match: + CMPQ R10, $0x03 + JEQ lz4s_s2_loop + LEAQ 2(DX), R8 + CMPQ R8, BX + JAE lz4s_s2_corrupt + MOVWQZX (DX), R9 + MOVQ R8, DX + TESTQ R9, R9 + JZ lz4s_s2_corrupt + CMPQ R9, SI + JA lz4s_s2_corrupt + CMPQ R10, $0x12 + JNE lz4s_s2_ml_done + +lz4s_s2_ml_loop: + MOVBQZX (DX), R8 + INCQ DX + ADDQ R8, R10 + CMPQ DX, BX + JAE lz4s_s2_corrupt + CMPQ R8, $0xff + JEQ lz4s_s2_ml_loop + +lz4s_s2_ml_done: + ADDQ R10, SI + CMPQ R9, DI + JNE lz4s_s2_docopy + + // emitRepeat +emit_repeat_again_lz4_s2: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2 + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2 + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2 + +cant_repeat_two_offset_lz4_s2: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2 + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2 + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2 + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2 + +repeat_five_lz4_s2: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4s_s2_loop + +repeat_four_lz4_s2: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4s_s2_loop + +repeat_three_lz4_s2: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +repeat_two_lz4_s2: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +repeat_two_offset_lz4_s2: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +lz4s_s2_docopy: + MOVQ R9, DI + + // emitCopy + CMPL R10, $0x40 + JBE two_byte_offset_short_lz4_s2 + CMPL R9, $0x00000800 + JAE long_offset_short_lz4_s2 + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB R9, 1(AX) + MOVL R9, R11 + SHRL $0x08, R11 + SHLL $0x05, R11 + ORL R11, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + +emit_repeat_again_lz4_s2_emit_copy_short_2b: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2_emit_copy_short_2b + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2_emit_copy_short_2b + 
+cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2_emit_copy_short_2b + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2_emit_copy_short_2b + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2_emit_copy_short_2b + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short_2b + +repeat_five_lz4_s2_emit_copy_short_2b: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4s_s2_loop + +repeat_four_lz4_s2_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4s_s2_loop + +repeat_three_lz4_s2_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +repeat_two_lz4_s2_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +long_offset_short_lz4_s2: + MOVB $0xee, (AX) + MOVW R9, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_lz4_s2_emit_copy_short: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JBE repeat_two_lz4_s2_emit_copy_short + CMPL R8, $0x0c + JAE cant_repeat_two_offset_lz4_s2_emit_copy_short + CMPL R9, $0x00000800 + JB repeat_two_offset_lz4_s2_emit_copy_short + +cant_repeat_two_offset_lz4_s2_emit_copy_short: + CMPL R10, $0x00000104 + JB repeat_three_lz4_s2_emit_copy_short + CMPL R10, $0x00010100 + JB repeat_four_lz4_s2_emit_copy_short + CMPL R10, $0x0100ffff + JB repeat_five_lz4_s2_emit_copy_short + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short + +repeat_five_lz4_s2_emit_copy_short: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4s_s2_loop + +repeat_four_lz4_s2_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4s_s2_loop + +repeat_three_lz4_s2_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +repeat_two_lz4_s2_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +two_byte_offset_short_lz4_s2: + MOVL R10, R8 + SHLL $0x02, R8 + CMPL R10, $0x0c + JAE emit_copy_three_lz4_s2 + CMPL R9, $0x00000800 + JAE emit_copy_three_lz4_s2 + LEAL -15(R8), R8 + MOVB R9, 1(AX) + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +emit_copy_three_lz4_s2: + LEAL -2(R8), R8 + MOVB R8, (AX) + MOVW R9, 1(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +lz4s_s2_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4s_s2_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4s_s2_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// 
Requires: SSE2 +TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -8(AX)(CX*1), CX + +lz4_snappy_loop: + CMPQ DX, BX + JAE lz4_snappy_corrupt + CMPQ AX, CX + JAE lz4_snappy_dstfull + MOVBQZX (DX), DI + MOVQ DI, R8 + MOVQ DI, R9 + SHRQ $0x04, R8 + ANDQ $0x0f, R9 + CMPQ DI, $0xf0 + JB lz4_snappy_ll_end + +lz4_snappy_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4_snappy_corrupt + MOVBQZX (DX), DI + ADDQ DI, R8 + CMPQ DI, $0xff + JEQ lz4_snappy_ll_loop + +lz4_snappy_ll_end: + LEAQ (DX)(R8*1), DI + ADDQ $0x04, R9 + CMPQ DI, BX + JAE lz4_snappy_corrupt + INCQ DX + INCQ DI + TESTQ R8, R8 + JZ lz4_snappy_lits_done + LEAQ (AX)(R8*1), R10 + CMPQ R10, CX + JAE lz4_snappy_dstfull + ADDQ R8, SI + LEAL -1(R8), R10 + CMPL R10, $0x3c + JB one_byte_lz4_snappy + CMPL R10, $0x00000100 + JB two_bytes_lz4_snappy + CMPL R10, $0x00010000 + JB three_bytes_lz4_snappy + CMPL R10, $0x01000000 + JB four_bytes_lz4_snappy + MOVB $0xfc, (AX) + MOVL R10, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4_snappy + +four_bytes_lz4_snappy: + MOVL R10, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW R10, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4_snappy + +three_bytes_lz4_snappy: + MOVB $0xf4, (AX) + MOVW R10, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4_snappy + +two_bytes_lz4_snappy: + MOVB $0xf0, (AX) + MOVB R10, 1(AX) + ADDQ $0x02, AX + CMPL R10, $0x40 + JB memmove_lz4_snappy + JMP memmove_long_lz4_snappy + +one_byte_lz4_snappy: + SHLB $0x02, R10 + MOVB R10, (AX) + ADDQ $0x01, AX + +memmove_lz4_snappy: + LEAQ (AX)(R8*1), R10 + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_lz4_snappy_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32 + JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64 + +emit_lit_memmove_lz4_snappy_memmove_move_8: + MOVQ (DX), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_lz4_snappy + +emit_lit_memmove_lz4_snappy_memmove_move_8through16: + MOVQ (DX), R11 + MOVQ -8(DX)(R8*1), DX + MOVQ R11, (AX) + MOVQ DX, -8(AX)(R8*1) + JMP memmove_end_copy_lz4_snappy + +emit_lit_memmove_lz4_snappy_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_lz4_snappy + +emit_lit_memmove_lz4_snappy_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_lz4_snappy: + MOVQ R10, AX + JMP lz4_snappy_lits_emit_done + +memmove_long_lz4_snappy: + LEAQ (AX)(R8*1), R10 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 + LEAQ -32(DX)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_lz4_snappylarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back + +emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32: + MOVOU -32(DX)(R13*1), X4 + MOVOU -16(DX)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + 
MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ R10, AX + +lz4_snappy_lits_emit_done: + MOVQ DI, DX + +lz4_snappy_lits_done: + CMPQ DX, BX + JNE lz4_snappy_match + CMPQ R9, $0x04 + JEQ lz4_snappy_done + JMP lz4_snappy_corrupt + +lz4_snappy_match: + LEAQ 2(DX), DI + CMPQ DI, BX + JAE lz4_snappy_corrupt + MOVWQZX (DX), R8 + MOVQ DI, DX + TESTQ R8, R8 + JZ lz4_snappy_corrupt + CMPQ R8, SI + JA lz4_snappy_corrupt + CMPQ R9, $0x13 + JNE lz4_snappy_ml_done + +lz4_snappy_ml_loop: + MOVBQZX (DX), DI + INCQ DX + ADDQ DI, R9 + CMPQ DX, BX + JAE lz4_snappy_corrupt + CMPQ DI, $0xff + JEQ lz4_snappy_ml_loop + +lz4_snappy_ml_done: + ADDQ R9, SI + + // emitCopy +two_byte_offset_lz4_s2: + CMPL R9, $0x40 + JBE two_byte_offset_short_lz4_s2 + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + CMPQ AX, CX + JAE lz4_snappy_loop + JMP two_byte_offset_lz4_s2 + +two_byte_offset_short_lz4_s2: + MOVL R9, DI + SHLL $0x02, DI + CMPL R9, $0x0c + JAE emit_copy_three_lz4_s2 + CMPL R8, $0x00000800 + JAE emit_copy_three_lz4_s2 + LEAL -15(DI), DI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP lz4_snappy_loop + +emit_copy_three_lz4_s2: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP lz4_snappy_loop + +lz4_snappy_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4_snappy_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4_snappy_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -8(AX)(CX*1), CX + +lz4s_snappy_loop: + CMPQ DX, BX + JAE lz4s_snappy_corrupt + CMPQ AX, CX + JAE lz4s_snappy_dstfull + MOVBQZX (DX), DI + MOVQ DI, R8 + MOVQ DI, R9 + SHRQ $0x04, R8 + ANDQ $0x0f, R9 + CMPQ DI, $0xf0 + JB lz4s_snappy_ll_end + +lz4s_snappy_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4s_snappy_corrupt + MOVBQZX (DX), DI + ADDQ DI, R8 + CMPQ DI, $0xff + JEQ lz4s_snappy_ll_loop + +lz4s_snappy_ll_end: + LEAQ (DX)(R8*1), DI + ADDQ $0x03, R9 + CMPQ DI, BX + JAE lz4s_snappy_corrupt + INCQ DX + INCQ DI + TESTQ R8, R8 + JZ lz4s_snappy_lits_done + LEAQ (AX)(R8*1), R10 + CMPQ R10, CX + JAE lz4s_snappy_dstfull + ADDQ R8, SI + LEAL -1(R8), R10 + CMPL R10, $0x3c + JB one_byte_lz4s_snappy + CMPL R10, $0x00000100 + JB two_bytes_lz4s_snappy + CMPL R10, $0x00010000 + JB three_bytes_lz4s_snappy + CMPL R10, $0x01000000 + JB four_bytes_lz4s_snappy + MOVB $0xfc, (AX) + MOVL R10, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4s_snappy + +four_bytes_lz4s_snappy: + MOVL R10, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW R10, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4s_snappy + +three_bytes_lz4s_snappy: + MOVB $0xf4, (AX) + MOVW R10, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4s_snappy + +two_bytes_lz4s_snappy: + MOVB $0xf0, (AX) + MOVB R10, 1(AX) + ADDQ $0x02, AX + CMPL R10, $0x40 + JB memmove_lz4s_snappy + JMP memmove_long_lz4s_snappy + +one_byte_lz4s_snappy: + SHLB $0x02, R10 + MOVB R10, (AX) + ADDQ $0x01, AX + +memmove_lz4s_snappy: + LEAQ (AX)(R8*1), 
R10 + + // genMemMoveShort + CMPQ R8, $0x08 + JBE emit_lit_memmove_lz4s_snappy_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32 + JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64 + +emit_lit_memmove_lz4s_snappy_memmove_move_8: + MOVQ (DX), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_lz4s_snappy + +emit_lit_memmove_lz4s_snappy_memmove_move_8through16: + MOVQ (DX), R11 + MOVQ -8(DX)(R8*1), DX + MOVQ R11, (AX) + MOVQ DX, -8(AX)(R8*1) + JMP memmove_end_copy_lz4s_snappy + +emit_lit_memmove_lz4s_snappy_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_lz4s_snappy + +emit_lit_memmove_lz4s_snappy_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_lz4s_snappy: + MOVQ R10, AX + JMP lz4s_snappy_lits_emit_done + +memmove_long_lz4s_snappy: + LEAQ (AX)(R8*1), R10 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 + LEAQ -32(DX)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_lz4s_snappylarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back + +emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32: + MOVOU -32(DX)(R13*1), X4 + MOVOU -16(DX)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ R10, AX + +lz4s_snappy_lits_emit_done: + MOVQ DI, DX + +lz4s_snappy_lits_done: + CMPQ DX, BX + JNE lz4s_snappy_match + CMPQ R9, $0x03 + JEQ lz4s_snappy_done + JMP lz4s_snappy_corrupt + +lz4s_snappy_match: + CMPQ R9, $0x03 + JEQ lz4s_snappy_loop + LEAQ 2(DX), DI + CMPQ DI, BX + JAE lz4s_snappy_corrupt + MOVWQZX (DX), R8 + MOVQ DI, DX + TESTQ R8, R8 + JZ lz4s_snappy_corrupt + CMPQ R8, SI + JA lz4s_snappy_corrupt + CMPQ R9, $0x12 + JNE lz4s_snappy_ml_done + +lz4s_snappy_ml_loop: + MOVBQZX (DX), DI + INCQ DX + ADDQ DI, R9 + CMPQ DX, BX + JAE lz4s_snappy_corrupt + CMPQ DI, $0xff + JEQ lz4s_snappy_ml_loop + +lz4s_snappy_ml_done: + ADDQ R9, SI + + // emitCopy +two_byte_offset_lz4_s2: + CMPL R9, $0x40 + JBE two_byte_offset_short_lz4_s2 + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + CMPQ AX, CX + JAE lz4s_snappy_loop + JMP two_byte_offset_lz4_s2 + +two_byte_offset_short_lz4_s2: + MOVL R9, DI + SHLL $0x02, DI + CMPL R9, $0x0c + JAE emit_copy_three_lz4_s2 + CMPL R8, $0x00000800 + JAE emit_copy_three_lz4_s2 + LEAL -15(DI), DI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP lz4s_snappy_loop + +emit_copy_three_lz4_s2: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP lz4s_snappy_loop + +lz4s_snappy_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + 
+lz4s_snappy_corrupt:
+	XORQ AX, AX
+	LEAQ -1(AX), SI
+	MOVQ SI, uncompressed+48(FP)
+	RET
+
+lz4s_snappy_dstfull:
+	XORQ AX, AX
+	LEAQ -2(AX), SI
+	MOVQ SI, uncompressed+48(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/s2/index.go b/vendor/github.com/klauspost/compress/s2/index.go
new file mode 100644
index 0000000000..4229957b96
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/index.go
@@ -0,0 +1,602 @@
+// Copyright (c) 2022+ Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"bytes"
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"io"
+	"sort"
+)
+
+const (
+	S2IndexHeader   = "s2idx\x00"
+	S2IndexTrailer  = "\x00xdi2s"
+	maxIndexEntries = 1 << 16
+	// If distance is less than this, we do not add the entry.
+	minIndexDist = 1 << 20
+)
+
+// Index represents an S2/Snappy index.
+type Index struct {
+	TotalUncompressed int64 // Total Uncompressed size if known. Will be -1 if unknown.
+	TotalCompressed   int64 // Total Compressed size if known. Will be -1 if unknown.
+	info              []struct {
+		compressedOffset   int64
+		uncompressedOffset int64
+	}
+	estBlockUncomp int64
+}
+
+func (i *Index) reset(maxBlock int) {
+	i.estBlockUncomp = int64(maxBlock)
+	i.TotalCompressed = -1
+	i.TotalUncompressed = -1
+	if len(i.info) > 0 {
+		i.info = i.info[:0]
+	}
+}
+
+// allocInfos will allocate an empty slice of infos.
+func (i *Index) allocInfos(n int) {
+	if n > maxIndexEntries {
+		panic("n > maxIndexEntries")
+	}
+	i.info = make([]struct {
+		compressedOffset   int64
+		uncompressedOffset int64
+	}, 0, n)
+}
+
+// add an uncompressed and compressed pair.
+// Entries must be sent in order.
+func (i *Index) add(compressedOffset, uncompressedOffset int64) error {
+	if i == nil {
+		return nil
+	}
+	lastIdx := len(i.info) - 1
+	if lastIdx >= 0 {
+		latest := i.info[lastIdx]
+		if latest.uncompressedOffset == uncompressedOffset {
+			// Uncompressed didn't change, don't add entry,
+			// but update start index.
+			latest.compressedOffset = compressedOffset
+			i.info[lastIdx] = latest
+			return nil
+		}
+		if latest.uncompressedOffset > uncompressedOffset {
+			return fmt.Errorf("internal error: Earlier uncompressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
+		}
+		if latest.compressedOffset > compressedOffset {
+			return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.compressedOffset, compressedOffset)
+		}
+		if latest.uncompressedOffset+minIndexDist > uncompressedOffset {
+			// Only add entry if distance is large enough.
+			return nil
+		}
+	}
+	i.info = append(i.info, struct {
+		compressedOffset   int64
+		uncompressedOffset int64
+	}{compressedOffset: compressedOffset, uncompressedOffset: uncompressedOffset})
+	return nil
+}
+
+// Find the offset at or before the wanted (uncompressed) offset.
+// If offset is 0 or positive, it is the offset from the beginning of the file.
+// If the uncompressed size is known, the offset must be within the file.
+// If an offset outside the file is requested, io.ErrUnexpectedEOF is returned.
+// If the offset is negative, it is interpreted as the distance from the end of the file,
+// where -1 represents the last byte.
+// If an offset from the end of the file is requested but the size is unknown,
+// ErrUnsupported will be returned.
+func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err error) {
+	if i.TotalUncompressed < 0 {
+		return 0, 0, ErrCorrupt
+	}
+	if offset < 0 {
+		offset = i.TotalUncompressed + offset
+		if offset < 0 {
+			return 0, 0, io.ErrUnexpectedEOF
+		}
+	}
+	if offset > i.TotalUncompressed {
+		return 0, 0, io.ErrUnexpectedEOF
+	}
+	if len(i.info) > 200 {
+		n := sort.Search(len(i.info), func(n int) bool {
+			return i.info[n].uncompressedOffset > offset
+		})
+		if n == 0 {
+			n = 1
+		}
+		return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil
+	}
+	for _, info := range i.info {
+		if info.uncompressedOffset > offset {
+			break
+		}
+		compressedOff = info.compressedOffset
+		uncompressedOff = info.uncompressedOffset
+	}
+	return compressedOff, uncompressedOff, nil
+}
+
+// reduce to stay below maxIndexEntries
+func (i *Index) reduce() {
+	if len(i.info) < maxIndexEntries && i.estBlockUncomp >= minIndexDist {
+		return
+	}
+
+	// Algorithm: keep one entry, then remove removeN entries...
+	removeN := (len(i.info) + 1) / maxIndexEntries
+	src := i.info
+	j := 0
+
+	// Each block should be at least 1MB, but don't reduce below 1000 entries.
+	for i.estBlockUncomp*(int64(removeN)+1) < minIndexDist && len(i.info)/(removeN+1) > 1000 {
+		removeN++
+	}
+	for idx := 0; idx < len(src); idx++ {
+		i.info[j] = src[idx]
+		j++
+		idx += removeN
+	}
+	i.info = i.info[:j]
+	// Update maxblock estimate.
+	i.estBlockUncomp += i.estBlockUncomp * int64(removeN)
+}
+
+func (i *Index) appendTo(b []byte, uncompTotal, compTotal int64) []byte {
+	i.reduce()
+	var tmp [binary.MaxVarintLen64]byte
+
+	initSize := len(b)
+	// We make the start a skippable header+size.
+	b = append(b, ChunkTypeIndex, 0, 0, 0)
+	b = append(b, []byte(S2IndexHeader)...)
+	// Total Uncompressed size
+	n := binary.PutVarint(tmp[:], uncompTotal)
+	b = append(b, tmp[:n]...)
+	// Total Compressed size
+	n = binary.PutVarint(tmp[:], compTotal)
+	b = append(b, tmp[:n]...)
+	// Put EstBlockUncomp size
+	n = binary.PutVarint(tmp[:], i.estBlockUncomp)
+	b = append(b, tmp[:n]...)
+	// Put length
+	n = binary.PutVarint(tmp[:], int64(len(i.info)))
+	b = append(b, tmp[:n]...)
+
+	// Check if we should add uncompressed offsets
+	var hasUncompressed byte
+	for idx, info := range i.info {
+		if idx == 0 {
+			if info.uncompressedOffset != 0 {
+				hasUncompressed = 1
+				break
+			}
+			continue
+		}
+		if info.uncompressedOffset != i.info[idx-1].uncompressedOffset+i.estBlockUncomp {
+			hasUncompressed = 1
+			break
+		}
+	}
+	b = append(b, hasUncompressed)
+
+	// Add each entry
+	if hasUncompressed == 1 {
+		for idx, info := range i.info {
+			uOff := info.uncompressedOffset
+			if idx > 0 {
+				prev := i.info[idx-1]
+				uOff -= prev.uncompressedOffset + (i.estBlockUncomp)
+			}
+			n = binary.PutVarint(tmp[:], uOff)
+			b = append(b, tmp[:n]...)
+		}
+	}
+
+	// Initial compressed size estimate.
+	cPredict := i.estBlockUncomp / 2
+
+	for idx, info := range i.info {
+		cOff := info.compressedOffset
+		if idx > 0 {
+			prev := i.info[idx-1]
+			cOff -= prev.compressedOffset + cPredict
+			// Update compressed size prediction, with half the error.
+			cPredict += cOff / 2
+		}
+		n = binary.PutVarint(tmp[:], cOff)
+		b = append(b, tmp[:n]...)
+	}
+
+	// Add Total Size.
+	// Stored as fixed size for easier reading.
+	binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)-initSize+4+len(S2IndexTrailer)))
+	b = append(b, tmp[:4]...)
+	// Trailer
+	b = append(b, []byte(S2IndexTrailer)...)
+
+	// Update size
+	chunkLen := len(b) - initSize - skippableFrameHeader
+	b[initSize+1] = uint8(chunkLen >> 0)
+	b[initSize+2] = uint8(chunkLen >> 8)
+	b[initSize+3] = uint8(chunkLen >> 16)
+	//fmt.Printf("chunklen: 0x%x Uncomp:%d, Comp:%d\n", chunkLen, uncompTotal, compTotal)
+	return b
+}
+
+// Load a binary index.
+// A zero value Index can be used or a previous one can be reused.
+func (i *Index) Load(b []byte) ([]byte, error) {
+	if len(b) <= 4+len(S2IndexHeader)+len(S2IndexTrailer) {
+		return b, io.ErrUnexpectedEOF
+	}
+	if b[0] != ChunkTypeIndex {
+		return b, ErrCorrupt
+	}
+	chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
+	b = b[4:]
+
+	// Validate we have enough...
+	if len(b) < chunkLen {
+		return b, io.ErrUnexpectedEOF
+	}
+	if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
+		return b, ErrUnsupported
+	}
+	b = b[len(S2IndexHeader):]
+
+	// Total Uncompressed
+	if v, n := binary.Varint(b); n <= 0 || v < 0 {
+		return b, ErrCorrupt
+	} else {
+		i.TotalUncompressed = v
+		b = b[n:]
+	}
+
+	// Total Compressed
+	if v, n := binary.Varint(b); n <= 0 {
+		return b, ErrCorrupt
+	} else {
+		i.TotalCompressed = v
+		b = b[n:]
+	}
+
+	// Read EstBlockUncomp
+	if v, n := binary.Varint(b); n <= 0 {
+		return b, ErrCorrupt
+	} else {
+		if v < 0 {
+			return b, ErrCorrupt
+		}
+		i.estBlockUncomp = v
+		b = b[n:]
+	}
+
+	var entries int
+	if v, n := binary.Varint(b); n <= 0 {
+		return b, ErrCorrupt
+	} else {
+		if v < 0 || v > maxIndexEntries {
+			return b, ErrCorrupt
+		}
+		entries = int(v)
+		b = b[n:]
+	}
+	if cap(i.info) < entries {
+		i.allocInfos(entries)
+	}
+	i.info = i.info[:entries]
+
+	if len(b) < 1 {
+		return b, io.ErrUnexpectedEOF
+	}
+	hasUncompressed := b[0]
+	b = b[1:]
+	if hasUncompressed&1 != hasUncompressed {
+		return b, ErrCorrupt
+	}
+
+	// Add each uncompressed entry
+	for idx := range i.info {
+		var uOff int64
+		if hasUncompressed != 0 {
+			// Load delta
+			if v, n := binary.Varint(b); n <= 0 {
+				return b, ErrCorrupt
+			} else {
+				uOff = v
+				b = b[n:]
+			}
+		}
+
+		if idx > 0 {
+			prev := i.info[idx-1].uncompressedOffset
+			uOff += prev + (i.estBlockUncomp)
+			if uOff <= prev {
+				return b, ErrCorrupt
+			}
+		}
+		if uOff < 0 {
+			return b, ErrCorrupt
+		}
+		i.info[idx].uncompressedOffset = uOff
+	}
+
+	// Initial compressed size estimate.
+	cPredict := i.estBlockUncomp / 2
+
+	// Add each compressed entry
+	for idx := range i.info {
+		var cOff int64
+		if v, n := binary.Varint(b); n <= 0 {
+			return b, ErrCorrupt
+		} else {
+			cOff = v
+			b = b[n:]
+		}
+
+		if idx > 0 {
+			// Update compressed size prediction, with half the error.
+			cPredictNew := cPredict + cOff/2
+
+			prev := i.info[idx-1].compressedOffset
+			cOff += prev + cPredict
+			if cOff <= prev {
+				return b, ErrCorrupt
+			}
+			cPredict = cPredictNew
+		}
+		if cOff < 0 {
+			return b, ErrCorrupt
+		}
+		i.info[idx].compressedOffset = cOff
+	}
+	if len(b) < 4+len(S2IndexTrailer) {
+		return b, io.ErrUnexpectedEOF
+	}
+	// Skip size...
+	b = b[4:]
+
+	// Check trailer...
+	if !bytes.Equal(b[:len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
+		return b, ErrCorrupt
+	}
+	return b[len(S2IndexTrailer):], nil
+}
+
+// LoadStream will load an index from the end of the supplied stream.
+// ErrUnsupported will be returned if the signature cannot be found.
+// ErrCorrupt will be returned if unexpected values are found.
+// io.ErrUnexpectedEOF is returned if there are too few bytes.
+// IO errors are returned as-is.
+func (i *Index) LoadStream(rs io.ReadSeeker) error {
+	// Go to end.
+	_, err := rs.Seek(-10, io.SeekEnd)
+	if err != nil {
+		return err
+	}
+	var tmp [10]byte
+	_, err = io.ReadFull(rs, tmp[:])
+	if err != nil {
+		return err
+	}
+	// Check trailer...
+	if !bytes.Equal(tmp[4:4+len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
+		return ErrUnsupported
+	}
+	sz := binary.LittleEndian.Uint32(tmp[:4])
+	if sz > maxChunkSize+skippableFrameHeader {
+		return ErrCorrupt
+	}
+	_, err = rs.Seek(-int64(sz), io.SeekEnd)
+	if err != nil {
+		return err
+	}
+
+	// Read index.
+	buf := make([]byte, sz)
+	_, err = io.ReadFull(rs, buf)
+	if err != nil {
+		return err
+	}
+	_, err = i.Load(buf)
+	return err
+}
+
+// IndexStream will return an index for a stream.
+// The stream structure will be checked, but
+// data within blocks is not verified.
+// The returned index can either be appended to the end of the stream
+// or stored separately.
+func IndexStream(r io.Reader) ([]byte, error) {
+	var i Index
+	var buf [maxChunkSize]byte
+	var readHeader bool
+	for {
+		_, err := io.ReadFull(r, buf[:4])
+		if err != nil {
+			if err == io.EOF {
+				return i.appendTo(nil, i.TotalUncompressed, i.TotalCompressed), nil
+			}
+			return nil, err
+		}
+		// Start of this chunk.
+		startChunk := i.TotalCompressed
+		i.TotalCompressed += 4
+
+		chunkType := buf[0]
+		if !readHeader {
+			if chunkType != chunkTypeStreamIdentifier {
+				return nil, ErrCorrupt
+			}
+			readHeader = true
+		}
+		chunkLen := int(buf[1]) | int(buf[2])<<8 | int(buf[3])<<16
+		if chunkLen < checksumSize {
+			return nil, ErrCorrupt
+		}
+
+		i.TotalCompressed += int64(chunkLen)
+		_, err = io.ReadFull(r, buf[:chunkLen])
+		if err != nil {
+			return nil, io.ErrUnexpectedEOF
+		}
+		// The chunk types are specified at
+		// https://github.com/google/snappy/blob/master/framing_format.txt
+		switch chunkType {
+		case chunkTypeCompressedData:
+			// Section 4.2. Compressed data (chunk type 0x00).
+			// Skip checksum.
+			dLen, err := DecodedLen(buf[checksumSize:])
+			if err != nil {
+				return nil, err
+			}
+			if dLen > maxBlockSize {
+				return nil, ErrCorrupt
+			}
+			if i.estBlockUncomp == 0 {
+				// Use first block for estimate...
+				i.estBlockUncomp = int64(dLen)
+			}
+			err = i.add(startChunk, i.TotalUncompressed)
+			if err != nil {
+				return nil, err
+			}
+			i.TotalUncompressed += int64(dLen)
+			continue
+		case chunkTypeUncompressedData:
+			n2 := chunkLen - checksumSize
+			if n2 > maxBlockSize {
+				return nil, ErrCorrupt
+			}
+			if i.estBlockUncomp == 0 {
+				// Use first block for estimate...
+				i.estBlockUncomp = int64(n2)
+			}
+			err = i.add(startChunk, i.TotalUncompressed)
+			if err != nil {
+				return nil, err
+			}
+			i.TotalUncompressed += int64(n2)
+			continue
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				return nil, ErrCorrupt
+			}
+
+			if string(buf[:len(magicBody)]) != magicBody {
+				if string(buf[:len(magicBody)]) != magicBodySnappy {
+					return nil, ErrCorrupt
+				}
+			}
+
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			return nil, ErrUnsupported
+		}
+		if chunkLen > maxChunkSize {
+			return nil, ErrUnsupported
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+	}
+}
+
+// JSON returns the index as JSON text.
+func (i *Index) JSON() []byte {
+	type offset struct {
+		CompressedOffset   int64 `json:"compressed"`
+		UncompressedOffset int64 `json:"uncompressed"`
+	}
+	x := struct {
+		TotalUncompressed int64    `json:"total_uncompressed"` // Total Uncompressed size if known. Will be -1 if unknown.
Will be -1 if unknown.
+		TotalCompressed   int64    `json:"total_compressed"` // Total Compressed size if known. Will be -1 if unknown.
+		Offsets           []offset `json:"offsets"`
+		EstBlockUncomp    int64    `json:"est_block_uncompressed"`
+	}{
+		TotalUncompressed: i.TotalUncompressed,
+		TotalCompressed:   i.TotalCompressed,
+		EstBlockUncomp:    i.estBlockUncomp,
+	}
+	for _, v := range i.info {
+		x.Offsets = append(x.Offsets, offset{CompressedOffset: v.compressedOffset, UncompressedOffset: v.uncompressedOffset})
+	}
+	b, _ := json.MarshalIndent(x, "", " ")
+	return b
+}
+
+// RemoveIndexHeaders will trim all headers and trailers from a given index.
+// This is expected to save 20 bytes.
+// These can be restored using RestoreIndexHeaders.
+// This removes a layer of security, but is the most compact representation.
+// Returns nil if the headers contain errors.
+// The returned slice references the provided slice.
+func RemoveIndexHeaders(b []byte) []byte {
+	const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4
+	if len(b) <= save {
+		return nil
+	}
+	if b[0] != ChunkTypeIndex {
+		return nil
+	}
+	chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
+	b = b[4:]
+
+	// Validate we have enough...
+	if len(b) < chunkLen {
+		return nil
+	}
+	b = b[:chunkLen]
+
+	if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
+		return nil
+	}
+	b = b[len(S2IndexHeader):]
+	if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) {
+		return nil
+	}
+	b = bytes.TrimSuffix(b, []byte(S2IndexTrailer))
+
+	if len(b) < 4 {
+		return nil
+	}
+	return b[:len(b)-4]
+}
+
+// RestoreIndexHeaders will restore index headers removed by RemoveIndexHeaders.
+// No error checking is performed on the input.
+// If a 0 length slice is sent, it is returned without modification.
+func RestoreIndexHeaders(in []byte) []byte {
+	if len(in) == 0 {
+		return in
+	}
+	b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4)
+	b = append(b, ChunkTypeIndex, 0, 0, 0)
+	b = append(b, []byte(S2IndexHeader)...)
+	b = append(b, in...)
+
+	var tmp [4]byte
+	binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer)))
+	b = append(b, tmp[:4]...)
+	// Trailer
+	b = append(b, []byte(S2IndexTrailer)...)
+
+	chunkLen := len(b) - skippableFrameHeader
+	b[1] = uint8(chunkLen >> 0)
+	b[2] = uint8(chunkLen >> 8)
+	b[3] = uint8(chunkLen >> 16)
+	return b
+}
diff --git a/vendor/github.com/klauspost/compress/s2/lz4convert.go b/vendor/github.com/klauspost/compress/s2/lz4convert.go
new file mode 100644
index 0000000000..46ed908e3c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/lz4convert.go
@@ -0,0 +1,585 @@
+// Copyright (c) 2022 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+)
+
+// LZ4Converter provides conversion from LZ4 blocks as defined here:
+// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md
+type LZ4Converter struct {
+}
+
+// ErrDstTooSmall is returned when provided destination is too small.
+var ErrDstTooSmall = errors.New("s2: destination too small")
+
+// ConvertBlock will convert an LZ4 block and append it as an S2
+// block without block length to dst.
+// The uncompressed size is returned as well.
+// dst must have capacity to contain the entire compressed block.
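As the comment above notes, ConvertBlock appends to dst and needs the full destination capacity up front. A minimal caller sketch, assuming the helper name and its inputs are purely illustrative; MaxEncodedLen is the package's own sizing helper, and the error values match those documented above:

	package example

	import "github.com/klauspost/compress/s2"

	// convertLZ4 converts one raw LZ4 block whose decompressed size is known.
	func convertLZ4(lz4Block []byte, uncompLen int) ([]byte, error) {
		conv := s2.LZ4Converter{}
		// ConvertBlock appends, so dst must carry enough capacity up front.
		dst := make([]byte, 0, s2.MaxEncodedLen(uncompLen))
		out, n, err := conv.ConvertBlock(dst, lz4Block)
		if err != nil {
			return nil, err // s2.ErrCorrupt or s2.ErrDstTooSmall
		}
		_ = n // n is the uncompressed size represented by out.
		return out, nil
	}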
+func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const inline = true + const lz4MinMatch = 4 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + if !debug && hasAmd64Asm { + res, sz := cvtLZ4BlockAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var lastOffset uint16 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return dst[:d], 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return dst[:d], 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if s == len(src) && ml == lz4MinMatch { + break + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if offset == lastOffset { + if debug { + fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitRepeat16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + d += 2 + break + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + d += 2 + break + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + d += 3 + break + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + d += 4 + break + } + const 
maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + d += 5 + emitRepeat16(dst[5:], offset, left) + break + } + d += 5 + break + } + } + } else { + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitCopy16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Offset no more than 2 bytes. + if length > 64 { + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } + // Emit remaining as repeats, at least 4 bytes remain. + d += off + emitRepeat16(dst[off:], offset, length) + break + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + } + lastOffset = offset + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} + +// ConvertBlockSnappy will convert an LZ4 block and append it +// as a Snappy block without block length to dst. +// The uncompressed size is returned as well. +// dst must have capacity to contain the entire compressed block. 
+func (l *LZ4Converter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const lz4MinMatch = 4 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + // Use assembly when possible + if !debug && hasAmd64Asm { + res, sz := cvtLZ4BlockSnappyAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return nil, 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if s == len(src) && ml == lz4MinMatch { + break + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + length := ml + // d += emitCopyNoRepeat(dst[d:], int(offset), ml) + for length > 0 { + if d >= dLimit { + return nil, 0, ErrDstTooSmall + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 64 copy, encoded as 3 bytes. + dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = 63<<2 | tagCopy2 + length -= 64 + d += 3 + continue + } + if length >= 12 || offset >= 2048 || length < 4 { + // Emit the remaining copy, encoded as 3 bytes. + dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. 
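+			// (tagCopy1 layout: bits 0-1 hold the tag, bits 2-4 hold length-4,
+			// bits 5-7 hold offset bits 8-10; the next byte holds offset bits
+			// 0-7. Hence the offset < 2048 and 4 <= length < 12 requirement.)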
+ dst[d+1] = uint8(offset) + dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +// Length must be at least 4 and < 1<<24 +func emitRepeat16(dst []byte, offset uint16, length int) int { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + return 2 + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + return 2 + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + return 3 + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + return 4 + } + const maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + return 5 + emitRepeat16(dst[5:], offset, left) + } + return 5 +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint16 +// 4 <= length && length <= math.MaxUint32 +func emitCopy16(dst []byte, offset uint16, length int) int { + // Offset no more than 2 bytes. + if length > 64 { + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } + // Emit remaining as repeats, at least 4 bytes remain. + return off + emitRepeat16(dst[off:], offset, length) + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. 
+// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +func emitLiteralGo(dst, lit []byte) int { + if len(lit) == 0 { + return 0 + } + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[1] = uint8(n) + dst[0] = 60<<2 | tagLiteral + i = 2 + case n < 1<<16: + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 61<<2 | tagLiteral + i = 3 + case n < 1<<24: + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 62<<2 | tagLiteral + i = 4 + default: + dst[4] = uint8(n >> 24) + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 63<<2 | tagLiteral + i = 5 + } + return i + copy(dst[i:], lit) +} diff --git a/vendor/github.com/klauspost/compress/s2/lz4sconvert.go b/vendor/github.com/klauspost/compress/s2/lz4sconvert.go new file mode 100644 index 0000000000..000f39719c --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/lz4sconvert.go @@ -0,0 +1,467 @@ +// Copyright (c) 2022 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "encoding/binary" + "fmt" +) + +// LZ4sConverter provides conversion from LZ4s. +// (Intel modified LZ4 Blocks) +// https://cdrdv2-public.intel.com/743912/743912-qat-programmers-guide-v2.0.pdf +// LZ4s is a variant of LZ4 block format. LZ4s should be considered as an intermediate compressed block format. +// The LZ4s format is selected when the application sets the compType to CPA_DC_LZ4S in CpaDcSessionSetupData. +// The LZ4s block returned by the Intel® QAT hardware can be used by an external +// software post-processing to generate other compressed data formats. +// The following table lists the differences between LZ4 and LZ4s block format. LZ4s block format uses +// the same high-level formatting as LZ4 block format with the following encoding changes: +// For Min Match of 4 bytes, Copy length value 1-15 means length 4-18 with 18 bytes adding an extra byte. +// ONLY "Min match of 4 bytes" is supported. +type LZ4sConverter struct { +} + +// ConvertBlock will convert an LZ4s block and append it as an S2 +// block without block length to dst. +// The uncompressed size is returned as well. +// dst must have capacity to contain the entire compressed block. 
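Both converters begin each iteration by splitting the token byte into a literal-length nibble and a match-length nibble; only the minimum match differs (4 for LZ4, 3 for LZ4s). A standalone illustration of that arithmetic, with a hypothetical token value:

	// splitToken mirrors the token parsing in the conversion loops below.
	func splitToken(token byte) (ll, ml int) {
		const lz4sMinMatch = 3 // LZ4 proper uses 4
		ll = int(token >> 4)                // literal bytes following the token
		ml = lz4sMinMatch + int(token&0xf)  // match length before any extension
		// A nibble of 15 extends the length: each following 255 byte adds 255,
		// and the first byte below 255 terminates it.
		return ll, ml // e.g. token 0x52 yields ll=5, ml=5
	}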
+func (l *LZ4sConverter) ConvertBlock(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const inline = true + const lz4MinMatch = 3 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + if !debug && hasAmd64Asm { + res, sz := cvtLZ4sBlockAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var lastOffset uint16 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return dst[:d], 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return dst[:d], 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if ml == lz4MinMatch { + if s == len(src) { + break + } + // 0 bytes. 
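+			// (LZ4s, unlike LZ4, allows a token with no match part: when the
+			// match nibble is zero and input remains, no offset follows and
+			// parsing simply moves on to the next token.)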
+ continue + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if offset == lastOffset { + if debug { + fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitRepeat16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + d += 2 + break + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + d += 2 + break + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + d += 3 + break + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + d += 4 + break + } + const maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + d += 5 + emitRepeat16(dst[5:], offset, left) + break + } + d += 5 + break + } + } + } else { + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitCopy16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Offset no more than 2 bytes. + if length > 64 { + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } + // Emit remaining as repeats, at least 4 bytes remain. + d += off + emitRepeat16(dst[off:], offset, length) + break + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. 
+ dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + } + lastOffset = offset + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} + +// ConvertBlockSnappy will convert an LZ4s block and append it +// as a Snappy block without block length to dst. +// The uncompressed size is returned as well. +// dst must have capacity to contain the entire compressed block. +func (l *LZ4sConverter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const lz4MinMatch = 3 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + // Use assembly when possible + if !debug && hasAmd64Asm { + res, sz := cvtLZ4sBlockSnappyAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return nil, 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if ml == lz4MinMatch { + if s == len(src) { + break + } + // 0 bytes. + continue + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + length := ml + // d += emitCopyNoRepeat(dst[d:], int(offset), ml) + for length > 0 { + if d >= dLimit { + return nil, 0, ErrDstTooSmall + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 64 copy, encoded as 3 bytes. 
+ dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = 63<<2 | tagCopy2 + length -= 64 + d += 3 + continue + } + if length >= 12 || offset >= 2048 || length < 4 { + // Emit the remaining copy, encoded as 3 bytes. + dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. + dst[d+1] = uint8(offset) + dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} diff --git a/vendor/github.com/klauspost/compress/s2/reader.go b/vendor/github.com/klauspost/compress/s2/reader.go new file mode 100644 index 0000000000..8372d752f9 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/reader.go @@ -0,0 +1,1075 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Copyright (c) 2019+ Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "errors" + "fmt" + "io" + "io/ioutil" + "math" + "runtime" + "sync" +) + +// ErrCantSeek is returned if the stream cannot be seeked. +type ErrCantSeek struct { + Reason string +} + +// Error returns the error as string. +func (e ErrCantSeek) Error() string { + return fmt.Sprintf("s2: Can't seek because %s", e.Reason) +} + +// NewReader returns a new Reader that decompresses from r, using the framing +// format described at +// https://github.com/google/snappy/blob/master/framing_format.txt with S2 changes. +func NewReader(r io.Reader, opts ...ReaderOption) *Reader { + nr := Reader{ + r: r, + maxBlock: maxBlockSize, + } + for _, opt := range opts { + if err := opt(&nr); err != nil { + nr.err = err + return &nr + } + } + nr.maxBufSize = MaxEncodedLen(nr.maxBlock) + checksumSize + if nr.lazyBuf > 0 { + nr.buf = make([]byte, MaxEncodedLen(nr.lazyBuf)+checksumSize) + } else { + nr.buf = make([]byte, MaxEncodedLen(defaultBlockSize)+checksumSize) + } + nr.readHeader = nr.ignoreStreamID + nr.paramsOK = true + return &nr +} + +// ReaderOption is an option for creating a decoder. +type ReaderOption func(*Reader) error + +// ReaderMaxBlockSize allows to control allocations if the stream +// has been compressed with a smaller WriterBlockSize, or with the default 1MB. +// Blocks must be this size or smaller to decompress, +// otherwise the decoder will return ErrUnsupported. +// +// For streams compressed with Snappy this can safely be set to 64KB (64 << 10). +// +// Default is the maximum limit of 4MB. +func ReaderMaxBlockSize(blockSize int) ReaderOption { + return func(r *Reader) error { + if blockSize > maxBlockSize || blockSize <= 0 { + return errors.New("s2: block size too large. Must be <= 4MB and > 0") + } + if r.lazyBuf == 0 && blockSize < defaultBlockSize { + r.lazyBuf = blockSize + } + r.maxBlock = blockSize + return nil + } +} + +// ReaderAllocBlock allows to control upfront stream allocations +// and not allocate for frames bigger than this initially. +// If frames bigger than this is seen a bigger buffer will be allocated. +// +// Default is 1MB, which is default output size. +func ReaderAllocBlock(blockSize int) ReaderOption { + return func(r *Reader) error { + if blockSize > maxBlockSize || blockSize < 1024 { + return errors.New("s2: invalid ReaderAllocBlock. 
Must be <= 4MB and >= 1024")
+		}
+		r.lazyBuf = blockSize
+		return nil
+	}
+}
+
+// ReaderIgnoreStreamIdentifier will make the reader skip the expected
+// stream identifier at the beginning of the stream.
+// This can be used when serving a stream that has been forwarded to a specific point.
+func ReaderIgnoreStreamIdentifier() ReaderOption {
+	return func(r *Reader) error {
+		r.ignoreStreamID = true
+		return nil
+	}
+}
+
+// ReaderSkippableCB will register a callback for chunks with the specified ID.
+// ID must be a Reserved skippable chunks ID, 0x80-0xfd (inclusive).
+// For each chunk with the ID, the callback is called with the content.
+// Any returned non-nil error will abort decompression.
+// Only one callback per ID is supported, latest sent will be used.
+// You can peek the stream, triggering the callback, by doing a Read with a 0
+// byte buffer.
+func ReaderSkippableCB(id uint8, fn func(r io.Reader) error) ReaderOption {
+	return func(r *Reader) error {
+		if id < 0x80 || id > 0xfd {
+			return fmt.Errorf("ReaderSkippableCB: Invalid id provided, must be 0x80-0xfd (inclusive)")
+		}
+		r.skippableCB[id-0x80] = fn
+		return nil
+	}
+}
+
+// ReaderIgnoreCRC will make the reader skip CRC calculation and checks.
+func ReaderIgnoreCRC() ReaderOption {
+	return func(r *Reader) error {
+		r.ignoreCRC = true
+		return nil
+	}
+}
+
+// Reader is an io.Reader that can read Snappy-compressed bytes.
+type Reader struct {
+	r           io.Reader
+	err         error
+	decoded     []byte
+	buf         []byte
+	skippableCB [0xff - 0x80]func(r io.Reader) error
+	blockStart  int64 // Uncompressed offset at start of the current block.
+	index       *Index
+
+	// decoded[i:j] contains decoded bytes that have not yet been passed on.
+	i, j int
+	// maximum block size allowed.
+	maxBlock int
+	// maximum expected buffer size.
+	maxBufSize int
+	// alloc a buffer this size if > 0.
+	lazyBuf        int
+	readHeader     bool
+	paramsOK       bool
+	snappyFrame    bool
+	ignoreStreamID bool
+	ignoreCRC      bool
+}
+
+// GetBufferCapacity returns the capacity of the internal buffer.
+// This might be useful to know when reusing the same reader in combination
+// with the lazy buffer option.
+func (r *Reader) GetBufferCapacity() int {
+	return cap(r.buf)
+}
+
+// ensureBufferSize will ensure that the buffer can take at least n bytes.
+// If false is returned the buffer exceeds maximum allowed size.
+func (r *Reader) ensureBufferSize(n int) bool {
+	if n > r.maxBufSize {
+		r.err = ErrCorrupt
+		return false
+	}
+	if cap(r.buf) >= n {
+		return true
+	}
+	// Realloc buffer.
+	r.buf = make([]byte, n)
+	return true
+}
+
+// Reset discards any buffered data, resets all state, and switches the Snappy
+// reader to read from r. This permits reusing a Reader rather than allocating
+// a new one.
+func (r *Reader) Reset(reader io.Reader) {
+	if !r.paramsOK {
+		return
+	}
+	r.index = nil
+	r.r = reader
+	r.err = nil
+	r.i = 0
+	r.j = 0
+	r.blockStart = 0
+	r.readHeader = r.ignoreStreamID
+}
+
+func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
+	if _, r.err = io.ReadFull(r.r, p); r.err != nil {
+		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
+			r.err = ErrCorrupt
+		}
+		return false
+	}
+	return true
+}
+
+// skippable will skip n bytes.
+// If the supplied reader supports seeking, that is used.
+// tmp is used as a temporary buffer for reading.
+// The supplied slice does not need to be the size of the read.
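The options above compose at construction time. A short sketch of that pattern before the skippable helper below; the wrapper name and the 0x80 chunk handler are illustrative only:

	package example

	import (
		"io"

		"github.com/klauspost/compress/s2"
	)

	// newTunedReader bounds allocations with a smaller max block size and
	// registers a callback for skippable chunks with ID 0x80.
	func newTunedReader(in io.Reader) *s2.Reader {
		return s2.NewReader(in,
			s2.ReaderMaxBlockSize(64<<10), // safe for Snappy-sized blocks
			s2.ReaderSkippableCB(0x80, func(r io.Reader) error {
				_, err := io.Copy(io.Discard, r) // inspect or drain the chunk body
				return err
			}),
		)
	}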
+func (r *Reader) skippable(tmp []byte, n int, allowEOF bool, id uint8) (ok bool) { + if id < 0x80 { + r.err = fmt.Errorf("internal error: skippable id < 0x80") + return false + } + if fn := r.skippableCB[id-0x80]; fn != nil { + rd := io.LimitReader(r.r, int64(n)) + r.err = fn(rd) + if r.err != nil { + return false + } + _, r.err = io.CopyBuffer(ioutil.Discard, rd, tmp) + return r.err == nil + } + if rs, ok := r.r.(io.ReadSeeker); ok { + _, err := rs.Seek(int64(n), io.SeekCurrent) + if err == nil { + return true + } + if err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + return false + } + } + for n > 0 { + if n < len(tmp) { + tmp = tmp[:n] + } + if _, r.err = io.ReadFull(r.r, tmp); r.err != nil { + if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + } + return false + } + n -= len(tmp) + } + return true +} + +// Read satisfies the io.Reader interface. +func (r *Reader) Read(p []byte) (int, error) { + if r.err != nil { + return 0, r.err + } + for { + if r.i < r.j { + n := copy(p, r.decoded[r.i:r.j]) + r.i += n + return n, nil + } + if !r.readFull(r.buf[:4], true) { + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return 0, r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > len(r.decoded) { + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + r.decoded = make([]byte, n) + } + if _, err := Decode(r.decoded, buf); err != nil { + r.err = err + return 0, r.err + } + if !r.ignoreCRC && crc(r.decoded[:n]) != checksum { + r.err = ErrCRC + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeUncompressedData: + r.blockStart += int64(r.j) + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return 0, r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf. 
+			n := chunkLen - checksumSize
+			if r.snappyFrame && n > maxSnappyBlockSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if n > len(r.decoded) {
+				if n > r.maxBlock {
+					r.err = ErrCorrupt
+					return 0, r.err
+				}
+				r.decoded = make([]byte, n)
+			}
+			if !r.readFull(r.decoded[:n], false) {
+				return 0, r.err
+			}
+			if !r.ignoreCRC && crc(r.decoded[:n]) != checksum {
+				r.err = ErrCRC
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.buf[:len(magicBody)], false) {
+				return 0, r.err
+			}
+			if string(r.buf[:len(magicBody)]) != magicBody {
+				if string(r.buf[:len(magicBody)]) != magicBodySnappy {
+					r.err = ErrCorrupt
+					return 0, r.err
+				} else {
+					r.snappyFrame = true
+				}
+			} else {
+				r.snappyFrame = false
+			}
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			// fmt.Printf("ERR chunktype: 0x%x\n", chunkType)
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if chunkLen > maxChunkSize {
+			// fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen)
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+
+		// fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen)
+		if !r.skippable(r.buf, chunkLen, false, chunkType) {
+			return 0, r.err
+		}
+	}
+}
+
+// DecodeConcurrent will decode the full stream to w.
+// This function should not be combined with reading, seeking or other operations.
+// Up to 'concurrent' goroutines will be used.
+// If <= 0, runtime.NumCPU will be used.
+// On success the number of decompressed bytes is returned and the error is nil.
+// This is mainly intended for bigger streams.
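A sketch of the intended call pattern for the function that follows (the wrapper is hypothetical): the reader must be freshly created, and concurrent <= 0 selects runtime.NumCPU goroutines as documented above.

	package example

	import (
		"io"

		"github.com/klauspost/compress/s2"
	)

	// decompressTo streams src through DecodeConcurrent using all CPUs.
	func decompressTo(dst io.Writer, src io.Reader) (int64, error) {
		r := s2.NewReader(src)
		// Must be called on a fresh reader; mixing with Read/Seek is unsupported.
		return r.DecodeConcurrent(dst, 0)
	}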
+func (r *Reader) DecodeConcurrent(w io.Writer, concurrent int) (written int64, err error) { + if r.i > 0 || r.j > 0 || r.blockStart > 0 { + return 0, errors.New("DecodeConcurrent called after ") + } + if concurrent <= 0 { + concurrent = runtime.NumCPU() + } + + // Write to output + var errMu sync.Mutex + var aErr error + setErr := func(e error) (ok bool) { + errMu.Lock() + defer errMu.Unlock() + if e == nil { + return aErr == nil + } + if aErr == nil { + aErr = e + } + return false + } + hasErr := func() (ok bool) { + errMu.Lock() + v := aErr != nil + errMu.Unlock() + return v + } + + var aWritten int64 + toRead := make(chan []byte, concurrent) + writtenBlocks := make(chan []byte, concurrent) + queue := make(chan chan []byte, concurrent) + reUse := make(chan chan []byte, concurrent) + for i := 0; i < concurrent; i++ { + toRead <- make([]byte, 0, r.maxBufSize) + writtenBlocks <- make([]byte, 0, r.maxBufSize) + reUse <- make(chan []byte, 1) + } + // Writer + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for toWrite := range queue { + entry := <-toWrite + reUse <- toWrite + if hasErr() || entry == nil { + if entry != nil { + writtenBlocks <- entry + } + continue + } + if hasErr() { + writtenBlocks <- entry + continue + } + n, err := w.Write(entry) + want := len(entry) + writtenBlocks <- entry + if err != nil { + setErr(err) + continue + } + if n != want { + setErr(io.ErrShortWrite) + continue + } + aWritten += int64(n) + } + }() + + defer func() { + if r.err != nil { + setErr(r.err) + } else if err != nil { + setErr(err) + } + close(queue) + wg.Wait() + if err == nil { + err = aErr + } + written = aWritten + }() + + // Reader + for !hasErr() { + if !r.readFull(r.buf[:4], true) { + if r.err == io.EOF { + r.err = nil + } + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + orgBuf := <-toRead + buf := orgBuf[:chunkLen] + + if !r.readFull(buf, false) { + return 0, r.err + } + + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + wg.Add(1) + + decoded := <-writtenBlocks + entry := <-reUse + queue <- entry + go func() { + defer wg.Done() + decoded = decoded[:n] + _, err := Decode(decoded, buf) + toRead <- orgBuf + if err != nil { + writtenBlocks <- decoded + setErr(err) + entry <- nil + return + } + if !r.ignoreCRC && crc(decoded) != checksum { + writtenBlocks <- decoded + setErr(ErrCRC) + entry <- nil + return + } + entry <- decoded + }() + continue + + case chunkTypeUncompressedData: + + // Section 4.3. Uncompressed data (chunk type 0x01). 
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + // Grab write buffer + orgBuf := <-writtenBlocks + buf := orgBuf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read content. + n := chunkLen - checksumSize + + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + // Read uncompressed + buf = orgBuf[:n] + if !r.readFull(buf, false) { + return 0, r.err + } + + if !r.ignoreCRC && crc(buf) != checksum { + r.err = ErrCRC + return 0, r.err + } + entry := <-reUse + queue <- entry + entry <- buf + continue + + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return 0, r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return 0, r.err + } + if string(r.buf[:len(magicBody)]) != magicBody { + if string(r.buf[:len(magicBody)]) != magicBodySnappy { + r.err = ErrCorrupt + return 0, r.err + } else { + r.snappyFrame = true + } + } else { + r.snappyFrame = false + } + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + // fmt.Printf("ERR chunktype: 0x%x\n", chunkType) + r.err = ErrUnsupported + return 0, r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if chunkLen > maxChunkSize { + // fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen) + r.err = ErrUnsupported + return 0, r.err + } + + // fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen) + if !r.skippable(r.buf, chunkLen, false, chunkType) { + return 0, r.err + } + } + return 0, r.err +} + +// Skip will skip n bytes forward in the decompressed output. +// For larger skips this consumes less CPU and is faster than reading output and discarding it. +// CRC is not checked on skipped blocks. +// io.ErrUnexpectedEOF is returned if the stream ends before all bytes have been skipped. +// If a decoding error is encountered subsequent calls to Read will also fail. +func (r *Reader) Skip(n int64) error { + if n < 0 { + return errors.New("attempted negative skip") + } + if r.err != nil { + return r.err + } + + for n > 0 { + if r.i < r.j { + // Skip in buffer. + // decoded[i:j] contains decoded bytes that have not yet been passed on. + left := int64(r.j - r.i) + if left >= n { + tmp := int64(r.i) + n + if tmp > math.MaxInt32 { + return errors.New("s2: internal overflow in skip") + } + r.i = int(tmp) + return nil + } + n -= int64(r.j - r.i) + r.i = r.j + } + + // Buffer empty; read blocks until we have content. + if !r.readFull(r.buf[:4], true) { + if r.err == io.EOF { + r.err = io.ErrUnexpectedEOF + } + return r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). 
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrUnsupported + } + return r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + dLen, err := DecodedLen(buf) + if err != nil { + r.err = err + return r.err + } + if dLen > r.maxBlock { + r.err = ErrCorrupt + return r.err + } + // Check if destination is within this block + if int64(dLen) > n { + if len(r.decoded) < dLen { + r.decoded = make([]byte, dLen) + } + if _, err := Decode(r.decoded, buf); err != nil { + r.err = err + return r.err + } + if crc(r.decoded[:dLen]) != checksum { + r.err = ErrCorrupt + return r.err + } + } else { + // Skip block completely + n -= int64(dLen) + r.blockStart += int64(dLen) + dLen = 0 + } + r.i, r.j = 0, dLen + continue + case chunkTypeUncompressedData: + r.blockStart += int64(r.j) + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err != nil { + r.err = ErrUnsupported + } + return r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf, false) { + return r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf. + n2 := chunkLen - checksumSize + if n2 > len(r.decoded) { + if n2 > r.maxBlock { + r.err = ErrCorrupt + return r.err + } + r.decoded = make([]byte, n2) + } + if !r.readFull(r.decoded[:n2], false) { + return r.err + } + if int64(n2) < n { + if crc(r.decoded[:n2]) != checksum { + r.err = ErrCorrupt + return r.err + } + } + r.i, r.j = 0, n2 + continue + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return r.err + } + if string(r.buf[:len(magicBody)]) != magicBody { + if string(r.buf[:len(magicBody)]) != magicBodySnappy { + r.err = ErrCorrupt + return r.err + } + } + + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + r.err = ErrUnsupported + return r.err + } + if chunkLen > maxChunkSize { + r.err = ErrUnsupported + return r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if !r.skippable(r.buf, chunkLen, false, chunkType) { + return r.err + } + } + return nil +} + +// ReadSeeker provides random or forward seeking in compressed content. +// See Reader.ReadSeeker +type ReadSeeker struct { + *Reader + readAtMu sync.Mutex +} + +// ReadSeeker will return an io.ReadSeeker and io.ReaderAt +// compatible version of the reader. +// If 'random' is specified the returned io.Seeker can be used for +// random seeking, otherwise only forward seeking is supported. +// Enabling random seeking requires the original input to support +// the io.Seeker interface. +// A custom index can be specified which will be used if supplied. +// When using a custom index, it will not be read from the input stream. +// The ReadAt position will affect regular reads and the current position of Seek. +// So using Read after ReadAt will continue from where the ReadAt stopped. +// No functions should be used concurrently. 
+// The returned ReadSeeker contains a shallow reference to the existing Reader, +// meaning changes performed to one is reflected in the other. +func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) { + // Read index if provided. + if len(index) != 0 { + if r.index == nil { + r.index = &Index{} + } + if _, err := r.index.Load(index); err != nil { + return nil, ErrCantSeek{Reason: "loading index returned: " + err.Error()} + } + } + + // Check if input is seekable + rs, ok := r.r.(io.ReadSeeker) + if !ok { + if !random { + return &ReadSeeker{Reader: r}, nil + } + return nil, ErrCantSeek{Reason: "input stream isn't seekable"} + } + + if r.index != nil { + // Seekable and index, ok... + return &ReadSeeker{Reader: r}, nil + } + + // Load from stream. + r.index = &Index{} + + // Read current position. + pos, err := rs.Seek(0, io.SeekCurrent) + if err != nil { + return nil, ErrCantSeek{Reason: "seeking input returned: " + err.Error()} + } + err = r.index.LoadStream(rs) + if err != nil { + if err == ErrUnsupported { + // If we don't require random seeking, reset input and return. + if !random { + _, err = rs.Seek(pos, io.SeekStart) + if err != nil { + return nil, ErrCantSeek{Reason: "resetting stream returned: " + err.Error()} + } + r.index = nil + return &ReadSeeker{Reader: r}, nil + } + return nil, ErrCantSeek{Reason: "input stream does not contain an index"} + } + return nil, ErrCantSeek{Reason: "reading index returned: " + err.Error()} + } + + // reset position. + _, err = rs.Seek(pos, io.SeekStart) + if err != nil { + return nil, ErrCantSeek{Reason: "seeking input returned: " + err.Error()} + } + return &ReadSeeker{Reader: r}, nil +} + +// Seek allows seeking in compressed data. +func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { + if r.err != nil { + if !errors.Is(r.err, io.EOF) { + return 0, r.err + } + // Reset on EOF + r.err = nil + } + + // Calculate absolute offset. + absOffset := offset + + switch whence { + case io.SeekStart: + case io.SeekCurrent: + absOffset = r.blockStart + int64(r.i) + offset + case io.SeekEnd: + if r.index == nil { + return 0, ErrUnsupported + } + absOffset = r.index.TotalUncompressed + offset + default: + r.err = ErrUnsupported + return 0, r.err + } + + if absOffset < 0 { + return 0, errors.New("seek before start of file") + } + + if !r.readHeader { + // Make sure we read the header. + _, r.err = r.Read([]byte{}) + if r.err != nil { + return 0, r.err + } + } + + // If we are inside current block no need to seek. + // This includes no offset changes. + if absOffset >= r.blockStart && absOffset < r.blockStart+int64(r.j) { + r.i = int(absOffset - r.blockStart) + return r.blockStart + int64(r.i), nil + } + + rs, ok := r.r.(io.ReadSeeker) + if r.index == nil || !ok { + currOffset := r.blockStart + int64(r.i) + if absOffset >= currOffset { + err := r.Skip(absOffset - currOffset) + return r.blockStart + int64(r.i), err + } + return 0, ErrUnsupported + } + + // We can seek and we have an index. + c, u, err := r.index.Find(absOffset) + if err != nil { + return r.blockStart + int64(r.i), err + } + + // Seek to next block + _, err = rs.Seek(c, io.SeekStart) + if err != nil { + return 0, err + } + + r.i = r.j // Remove rest of current block. + r.blockStart = u - int64(r.j) // Adjust current block start for accounting. 
+	if u < absOffset {
+		// Forward inside block
+		return absOffset, r.Skip(absOffset - u)
+	}
+	if u > absOffset {
+		return 0, fmt.Errorf("s2 seek: (internal error) u (%d) > absOffset (%d)", u, absOffset)
+	}
+	return absOffset, nil
+}
+
+// ReadAt reads len(p) bytes into p starting at offset off in the
+// underlying input source. It returns the number of bytes
+// read (0 <= n <= len(p)) and any error encountered.
+//
+// When ReadAt returns n < len(p), it returns a non-nil error
+// explaining why more bytes were not returned. In this respect,
+// ReadAt is stricter than Read.
+//
+// Even if ReadAt returns n < len(p), it may use all of p as scratch
+// space during the call. If some data is available but not len(p) bytes,
+// ReadAt blocks until either all the data is available or an error occurs.
+// In this respect ReadAt is different from Read.
+//
+// If the n = len(p) bytes returned by ReadAt are at the end of the
+// input source, ReadAt may return either err == EOF or err == nil.
+//
+// If ReadAt is reading from an input source with a seek offset,
+// ReadAt should not affect nor be affected by the underlying
+// seek offset.
+//
+// Clients of ReadAt can execute parallel ReadAt calls on the
+// same input source. This is however not recommended.
+func (r *ReadSeeker) ReadAt(p []byte, offset int64) (int, error) {
+	r.readAtMu.Lock()
+	defer r.readAtMu.Unlock()
+	_, err := r.Seek(offset, io.SeekStart)
+	if err != nil {
+		return 0, err
+	}
+	n := 0
+	for n < len(p) {
+		n2, err := r.Read(p[n:])
+		if err != nil {
+			// This will include io.EOF
+			return n + n2, err
+		}
+		n += n2
+	}
+	return n, nil
+}
+
+// ReadByte satisfies the io.ByteReader interface.
+func (r *Reader) ReadByte() (byte, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+	if r.i < r.j {
+		c := r.decoded[r.i]
+		r.i++
+		return c, nil
+	}
+	var tmp [1]byte
+	for i := 0; i < 10; i++ {
+		n, err := r.Read(tmp[:])
+		if err != nil {
+			return 0, err
+		}
+		if n == 1 {
+			return tmp[0], nil
+		}
+	}
+	return 0, io.ErrNoProgress
+}
+
+// SkippableCB will register a callback for chunks with the specified ID.
+// ID must be a Reserved skippable chunks ID, 0x80-0xfd (inclusive).
+// For each chunk with the ID, the callback is called with the content.
+// Any returned non-nil error will abort decompression.
+// Only one callback per ID is supported, latest sent will be used.
+// Sending a nil function will disable previous callbacks.
+// You can peek the stream, triggering the callback, by doing a Read with a 0
+// byte buffer.
+func (r *Reader) SkippableCB(id uint8, fn func(r io.Reader) error) error {
+	if id < 0x80 || id >= chunkTypePadding {
+		return fmt.Errorf("SkippableCB: Invalid id provided, must be 0x80-0xfd (inclusive)")
+	}
+	r.skippableCB[id-0x80] = fn
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/s2/s2.go b/vendor/github.com/klauspost/compress/s2/s2.go
new file mode 100644
index 0000000000..cbd1ed64d6
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/s2.go
@@ -0,0 +1,151 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package s2 implements the S2 compression format.
+//
+// S2 is an extension of Snappy. Similar to Snappy, S2 is aimed at high throughput,
+// which is why it features concurrent compression for bigger payloads.
diff --git a/vendor/github.com/klauspost/compress/s2/s2.go b/vendor/github.com/klauspost/compress/s2/s2.go
new file mode 100644
index 0000000000..cbd1ed64d6
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/s2.go
@@ -0,0 +1,151 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package s2 implements the S2 compression format.
+//
+// S2 is an extension of Snappy. Like Snappy, S2 is aimed at high throughput,
+// which is why it features concurrent compression for bigger payloads.
+//
+// Decoding is compatible with Snappy compressed content,
+// but content compressed with S2 cannot be decompressed by Snappy.
+//
+// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
+//
+// There are actually two S2 formats: block and stream. They are related,
+// but different: trying to decompress block-compressed data as a S2 stream
+// will fail, and vice versa. The block format is the Decode and Encode
+// functions and the stream format is the Reader and Writer types.
+//
+// A "better" compression option is available. This will trade some compression
+// speed for a better compression ratio.
+//
+// The block format, the more common case, is used when the complete size (the
+// number of bytes) of the original data is known upfront, at the time
+// compression starts. The stream format, also known as the framing format, is
+// for when that isn't always true.
+//
+// Blocks do not offer much data protection, so it is up to you to
+// add data validation of decompressed blocks.
+//
+// Streams perform CRC validation of the decompressed data.
+// Stream compression will also be performed on multiple CPU cores concurrently,
+// significantly improving throughput.
+package s2

+import (
+	"bytes"
+	"hash/crc32"
+
+	"github.com/klauspost/compress/internal/race"
+)
+
+/*
+Each encoded block begins with the varint-encoded length of the decoded data,
+followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
+first byte of each chunk is broken into its 2 least and 6 most significant bits
+called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
+Zero means a literal tag. All other values mean a copy tag.
+
+For literal tags:
+  - If m < 60, the next 1 + m bytes are literal bytes.
+  - Otherwise, let n be the little-endian unsigned integer denoted by the next
+    m - 59 bytes. The next 1 + n bytes after that are literal bytes.
+
+For copy tags, length bytes are copied from offset bytes ago, in the style of
+Lempel-Ziv compression algorithms. In particular:
+  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
+    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
+    of the offset. The next byte is bits 0-7 of the offset.
+  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
+    The length is 1 + m. The offset is the little-endian unsigned integer
+    denoted by the next 2 bytes.
+  - For l == 3, the offset ranges in [0, 1<<32) and the length in
+    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
+    integer denoted by the next 4 bytes.
+*/
+const (
+	tagLiteral = 0x00
+	tagCopy1   = 0x01
+	tagCopy2   = 0x02
+	tagCopy4   = 0x03
+)
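To make the tag layout above concrete, here is a small standalone sketch, not part of the vendored file, that decodes a copy-1 tag by hand; the two input bytes are invented.

```go
package main

import "fmt"

func main() {
	// A copy-1 tag: the low 2 bits (l) are 1; m is the high 6 bits.
	// Per the spec above: length = 4 + (m & 7); offset bits 8-10 come
	// from m >> 3 and the next byte supplies offset bits 0-7.
	tag, next := byte(0x6d), byte(0x2a) // hypothetical input bytes

	l := tag & 0x03
	m := tag >> 2
	if l != 1 {
		panic("not a copy-1 tag")
	}
	length := 4 + int(m&7)            // 0x6d >> 2 == 27, 27&7 == 3, so 7
	offset := int(m>>3)<<8 | int(next) // 3<<8 | 0x2a == 810
	fmt.Printf("copy %d bytes from %d bytes back\n", length, offset)
}
```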
+
+const (
+	checksumSize     = 4
+	chunkHeaderSize  = 4
+	magicChunk       = "\xff\x06\x00\x00" + magicBody
+	magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy
+	magicBodySnappy  = "sNaPpY"
+	magicBody        = "S2sTwO"
+
+	// maxBlockSize is the maximum size of the input to encodeBlock.
+	//
+	// For the framing format (Writer type instead of Encode function),
+	// this is the maximum uncompressed size of a block.
+	maxBlockSize = 4 << 20
+
+	// minBlockSize is the minimum block size allowed when creating a writer.
+	minBlockSize = 4 << 10
+
+	skippableFrameHeader = 4
+	maxChunkSize         = 1<<24 - 1 // 16777215
+
+	// defaultBlockSize is the default block size.
+	defaultBlockSize = 1 << 20
+
+	// maxSnappyBlockSize is the maximum snappy block size.
+	maxSnappyBlockSize = 1 << 16
+
+	obufHeaderLen = checksumSize + chunkHeaderSize
+)
+
+const (
+	chunkTypeCompressedData   = 0x00
+	chunkTypeUncompressedData = 0x01
+	ChunkTypeIndex            = 0x99
+	chunkTypePadding          = 0xfe
+	chunkTypeStreamIdentifier = 0xff
+)
+
+var (
+	crcTable              = crc32.MakeTable(crc32.Castagnoli)
+	magicChunkSnappyBytes = []byte(magicChunkSnappy) // Can be passed to functions where it escapes.
+	magicChunkBytes       = []byte(magicChunk)       // Can be passed to functions where it escapes.
+)
+
+// crc implements the checksum specified in section 3 of
+// https://github.com/google/snappy/blob/master/framing_format.txt
+func crc(b []byte) uint32 {
+	race.ReadSlice(b)
+
+	c := crc32.Update(0, crcTable, b)
+	return c>>15 | c<<17 + 0xa282ead8
+}
+
+// literalExtraSize returns the extra size of encoding n literals.
+// n should be >= 0 and <= math.MaxUint32.
+func literalExtraSize(n int64) int64 {
+	if n == 0 {
+		return 0
+	}
+	switch {
+	case n < 60:
+		return 1
+	case n < 1<<8:
+		return 2
+	case n < 1<<16:
+		return 3
+	case n < 1<<24:
+		return 4
+	default:
+		return 5
+	}
+}
+
+type byter interface {
+	Bytes() []byte
+}
+
+var _ byter = &bytes.Buffer{}
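A note on the crc function above: in Go, | and + share precedence and associate left to right, so the expression groups as a 15-bit right rotation of the CRC followed by the additive constant. Here is a standalone re-derivation using math/bits, offered for checking rather than as the vendored implementation:

```go
package main

import (
	"fmt"
	"hash/crc32"
	"math/bits"
)

// maskedCRC mirrors the vendored crc function: a Castagnoli CRC rotated
// right by 15 bits, plus a constant, per section 3 of the Snappy framing
// format. Rotating right by 15 equals rotating left by 17 for a uint32.
func maskedCRC(b []byte) uint32 {
	c := crc32.Checksum(b, crc32.MakeTable(crc32.Castagnoli))
	return bits.RotateLeft32(c, 17) + 0xa282ead8
}

func main() {
	fmt.Printf("%#08x\n", maskedCRC([]byte("hello world")))
}
```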
diff --git a/vendor/github.com/klauspost/compress/s2/writer.go b/vendor/github.com/klauspost/compress/s2/writer.go
new file mode 100644
index 0000000000..fd15078f7d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/writer.go
@@ -0,0 +1,1064 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Copyright (c) 2019+ Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package s2
+
+import (
+	"crypto/rand"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"runtime"
+	"sync"
+
+	"github.com/klauspost/compress/internal/race"
+)
+
+const (
+	levelUncompressed = iota + 1
+	levelFast
+	levelBetter
+	levelBest
+)
+
+// NewWriter returns a new Writer that compresses to w, using the
+// framing format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+//
+// Users must call Close to guarantee all data has been forwarded to
+// the underlying io.Writer and that resources are released.
+// They may also call Flush zero or more times before calling Close.
+func NewWriter(w io.Writer, opts ...WriterOption) *Writer {
+	w2 := Writer{
+		blockSize:   defaultBlockSize,
+		concurrency: runtime.GOMAXPROCS(0),
+		randSrc:     rand.Reader,
+		level:       levelFast,
+	}
+	for _, opt := range opts {
+		if err := opt(&w2); err != nil {
+			w2.errState = err
+			return &w2
+		}
+	}
+	w2.obufLen = obufHeaderLen + MaxEncodedLen(w2.blockSize)
+	w2.paramsOK = true
+	w2.ibuf = make([]byte, 0, w2.blockSize)
+	w2.buffers.New = func() interface{} {
+		return make([]byte, w2.obufLen)
+	}
+	w2.Reset(w)
+	return &w2
+}
+
+// Writer is an io.Writer that can write S2-compressed bytes.
+type Writer struct {
+	errMu    sync.Mutex
+	errState error
+
+	// ibuf is a buffer for the incoming (uncompressed) bytes.
+	ibuf []byte
+
+	blockSize     int
+	obufLen       int
+	concurrency   int
+	written       int64
+	uncompWritten int64 // Bytes sent to compression
+	output        chan chan result
+	buffers       sync.Pool
+	pad           int
+
+	writer    io.Writer
+	randSrc   io.Reader
+	writerWg  sync.WaitGroup
+	index     Index
+	customEnc func(dst, src []byte) int
+
+	// wroteStreamHeader is whether we have written the stream header.
+	wroteStreamHeader bool
+	paramsOK          bool
+	snappy            bool
+	flushOnWrite      bool
+	appendIndex       bool
+	bufferCB          func([]byte)
+	level             uint8
+}
+
+type result struct {
+	b []byte
+	// ret is handed to the buffer done callback once written.
+	ret []byte
+	// startOffset is the uncompressed start offset of the block.
+	startOffset int64
+}
+
+// err returns the previously set error.
+// If no error has been set, it is set to the provided err if that is non-nil.
+func (w *Writer) err(err error) error {
+	w.errMu.Lock()
+	errSet := w.errState
+	if errSet == nil && err != nil {
+		w.errState = err
+		errSet = err
+	}
+	w.errMu.Unlock()
+	return errSet
+}
+
+// Reset discards the writer's state and switches the Writer to write to w.
+// This permits reusing a Writer rather than allocating a new one.
+func (w *Writer) Reset(writer io.Writer) {
+	if !w.paramsOK {
+		return
+	}
+	// Close previous writer, if any.
+	if w.output != nil {
+		close(w.output)
+		w.writerWg.Wait()
+		w.output = nil
+	}
+	w.errState = nil
+	w.ibuf = w.ibuf[:0]
+	w.wroteStreamHeader = false
+	w.written = 0
+	w.writer = writer
+	w.uncompWritten = 0
+	w.index.reset(w.blockSize)
+
+	// If we didn't get a writer, stop here.
+	if writer == nil {
+		return
+	}
+	// If no concurrency requested, don't spin up writer goroutine.
+	if w.concurrency == 1 {
+		return
+	}
+
+	toWrite := make(chan chan result, w.concurrency)
+	w.output = toWrite
+	w.writerWg.Add(1)
+
+	// Start a writer goroutine that will write all output in order.
+	go func() {
+		defer w.writerWg.Done()
+
+		// Get a queued write.
+		for write := range toWrite {
+			// Wait for the data to be available.
+			input := <-write
+			if input.ret != nil && w.bufferCB != nil {
+				w.bufferCB(input.ret)
+				input.ret = nil
+			}
+			in := input.b
+			if len(in) > 0 {
+				if w.err(nil) == nil {
+					// Don't expose data from previous buffers.
+					toWrite := in[:len(in):len(in)]
+					// Write to output.
+					n, err := writer.Write(toWrite)
+					if err == nil && n != len(toWrite) {
+						err = io.ErrShortBuffer
+					}
+					_ = w.err(err)
+					w.err(w.index.add(w.written, input.startOffset))
+					w.written += int64(n)
+				}
+			}
+			if cap(in) >= w.obufLen {
+				w.buffers.Put(in)
+			}
+			// Close the incoming write request.
+			// This can be used for synchronizing flushes.
+			close(write)
+		}
+	}()
+}
+
+// Write satisfies the io.Writer interface.
+func (w *Writer) Write(p []byte) (nRet int, errRet error) {
+	if err := w.err(nil); err != nil {
+		return 0, err
+	}
+	if w.flushOnWrite {
+		return w.write(p)
+	}
+	// If we exceed the input buffer size, start writing.
+	for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err(nil) == nil {
+		var n int
+		if len(w.ibuf) == 0 {
+			// Large write, empty buffer.
+			// Write directly from p to avoid copy.
+			n, _ = w.write(p)
+		} else {
+			n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+			w.ibuf = w.ibuf[:len(w.ibuf)+n]
+			w.write(w.ibuf)
+			w.ibuf = w.ibuf[:0]
+		}
+		nRet += n
+		p = p[n:]
+	}
+	if err := w.err(nil); err != nil {
+		return nRet, err
+	}
+	// p should always be able to fit into w.ibuf now.
+	n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+	w.ibuf = w.ibuf[:len(w.ibuf)+n]
+	nRet += n
+	return nRet, nil
+}
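The buffered Write path above is the common entry point. A minimal round-trip sketch, assuming the matching NewReader from this same package; the payload is invented:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"strings"

	"github.com/klauspost/compress/s2"
)

func main() {
	// Compress a stream.
	var compressed bytes.Buffer
	enc := s2.NewWriter(&compressed)
	src := strings.NewReader(strings.Repeat("hello s2 ", 1000))
	if _, err := io.Copy(enc, src); err != nil {
		panic(err)
	}
	// Close flushes pending blocks and stops the writer goroutine.
	if err := enc.Close(); err != nil {
		panic(err)
	}

	// Decompress it again.
	dec := s2.NewReader(&compressed)
	out, err := io.ReadAll(dec)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(out)) // 9000
}
```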
+
+// ReadFrom implements the io.ReaderFrom interface.
+// Using this is typically more efficient since it avoids a memory copy.
+// ReadFrom reads data from r until EOF or error.
+// The return value n is the number of bytes read.
+// Any error except io.EOF encountered during the read is also returned.
+func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) {
+	if err := w.err(nil); err != nil {
+		return 0, err
+	}
+	if len(w.ibuf) > 0 {
+		err := w.AsyncFlush()
+		if err != nil {
+			return 0, err
+		}
+	}
+	if br, ok := r.(byter); ok {
+		buf := br.Bytes()
+		if err := w.EncodeBuffer(buf); err != nil {
+			return 0, err
+		}
+		return int64(len(buf)), w.AsyncFlush()
+	}
+	for {
+		inbuf := w.buffers.Get().([]byte)[:w.blockSize+obufHeaderLen]
+		n2, err := io.ReadFull(r, inbuf[obufHeaderLen:])
+		if err != nil {
+			if err == io.ErrUnexpectedEOF {
+				err = io.EOF
+			}
+			if err != io.EOF {
+				return n, w.err(err)
+			}
+		}
+		if n2 == 0 {
+			if cap(inbuf) >= w.obufLen {
+				w.buffers.Put(inbuf)
+			}
+			break
+		}
+		n += int64(n2)
+		err2 := w.writeFull(inbuf[:n2+obufHeaderLen])
+		if w.err(err2) != nil {
+			break
+		}
+
+		if err != nil {
+			// We got EOF and wrote everything.
+			break
+		}
+	}
+
+	return n, w.err(nil)
+}
+
+// AddSkippableBlock will add a skippable block to the stream.
+// The ID must be 0x80-0xfe (inclusive).
+// Length of the skippable block must be <= 16777215 bytes.
+func (w *Writer) AddSkippableBlock(id uint8, data []byte) (err error) {
+	if err := w.err(nil); err != nil {
+		return err
+	}
+	if len(data) == 0 {
+		return nil
+	}
+	if id < 0x80 || id > chunkTypePadding {
+		return fmt.Errorf("invalid skippable block id %x", id)
+	}
+	if len(data) > maxChunkSize {
+		return fmt.Errorf("skippable block exceeds maximum size")
+	}
+	var header [4]byte
+	chunkLen := len(data)
+	header[0] = id
+	header[1] = uint8(chunkLen >> 0)
+	header[2] = uint8(chunkLen >> 8)
+	header[3] = uint8(chunkLen >> 16)
+	if w.concurrency == 1 {
+		write := func(b []byte) error {
+			n, err := w.writer.Write(b)
+			if err = w.err(err); err != nil {
+				return err
+			}
+			if n != len(b) {
+				return w.err(io.ErrShortWrite)
+			}
+			w.written += int64(n)
+			return w.err(nil)
+		}
+		if !w.wroteStreamHeader {
+			w.wroteStreamHeader = true
+			if w.snappy {
+				if err := write([]byte(magicChunkSnappy)); err != nil {
+					return err
+				}
+			} else {
+				if err := write([]byte(magicChunk)); err != nil {
+					return err
+				}
+			}
+		}
+		if err := write(header[:]); err != nil {
+			return err
+		}
+		return write(data)
+	}
+
+	// Create output...
+	if !w.wroteStreamHeader {
+		w.wroteStreamHeader = true
+		hWriter := make(chan result)
+		w.output <- hWriter
+		if w.snappy {
+			hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes}
+		} else {
+			hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes}
+		}
+	}
+
+	// Copy input.
+	inbuf := w.buffers.Get().([]byte)[:4]
+	copy(inbuf, header[:])
+	inbuf = append(inbuf, data...)
+
+	output := make(chan result, 1)
+	// Queue output.
+	w.output <- output
+	output <- result{startOffset: w.uncompWritten, b: inbuf}
+
+	return nil
+}
+
+// EncodeBuffer will add a buffer to the stream.
+// This is the fastest way to encode a stream, but when concurrency != 1
+// the input buffer must not be written to by the caller until Flush or
+// Close has been called.
+//
+// Use the WriterBufferDone option to receive a callback when the buffer
+// is done processing.
+//
+// Note that input is not buffered.
+// This means that each write will result in discrete blocks being created.
+// For buffered writes, use the regular Write function.
+func (w *Writer) EncodeBuffer(buf []byte) (err error) {
+	if err := w.err(nil); err != nil {
+		return err
+	}
+
+	if w.flushOnWrite {
+		_, err := w.write(buf)
+		return err
+	}
+	// Flush queued data first.
+ if len(w.ibuf) > 0 { + err := w.AsyncFlush() + if err != nil { + return err + } + } + if w.concurrency == 1 { + _, err := w.writeSync(buf) + if w.bufferCB != nil { + w.bufferCB(buf) + } + return err + } + + // Spawn goroutine and write block to output channel. + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} + } else { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} + } + } + orgBuf := buf + for len(buf) > 0 { + // Cut input. + uncompressed := buf + if len(uncompressed) > w.blockSize { + uncompressed = uncompressed[:w.blockSize] + } + buf = buf[len(uncompressed):] + // Get an output buffer. + obuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] + race.WriteSlice(obuf) + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + res := result{ + startOffset: w.uncompWritten, + } + w.uncompWritten += int64(len(uncompressed)) + if len(buf) == 0 && w.bufferCB != nil { + res.ret = orgBuf + } + go func() { + race.ReadSlice(uncompressed) + + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // copy uncompressed + copy(obuf[obufHeaderLen:], uncompressed) + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + res.b = obuf + output <- res + }() + } + return nil +} + +func (w *Writer) encodeBlock(obuf, uncompressed []byte) int { + if w.customEnc != nil { + if ret := w.customEnc(obuf, uncompressed); ret >= 0 { + return ret + } + } + if w.snappy { + switch w.level { + case levelFast: + return encodeBlockSnappy(obuf, uncompressed) + case levelBetter: + return encodeBlockBetterSnappy(obuf, uncompressed) + case levelBest: + return encodeBlockBestSnappy(obuf, uncompressed) + } + return 0 + } + switch w.level { + case levelFast: + return encodeBlock(obuf, uncompressed) + case levelBetter: + return encodeBlockBetter(obuf, uncompressed) + case levelBest: + return encodeBlockBest(obuf, uncompressed, nil) + } + return 0 +} + +func (w *Writer) write(p []byte) (nRet int, errRet error) { + if err := w.err(nil); err != nil { + return 0, err + } + if w.concurrency == 1 { + return w.writeSync(p) + } + + // Spawn goroutine and write block to output channel. + for len(p) > 0 { + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} + } else { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} + } + } + + var uncompressed []byte + if len(p) > w.blockSize { + uncompressed, p = p[:w.blockSize], p[w.blockSize:] + } else { + uncompressed, p = p, nil + } + + // Copy input. 
+ // If the block is incompressible, this is used for the result. + inbuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] + obuf := w.buffers.Get().([]byte)[:w.obufLen] + copy(inbuf[obufHeaderLen:], uncompressed) + uncompressed = inbuf[obufHeaderLen:] + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + res := result{ + startOffset: w.uncompWritten, + } + w.uncompWritten += int64(len(uncompressed)) + + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // Use input as output. + obuf, inbuf = inbuf, obuf + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + res.b = obuf + output <- res + + // Put unused buffer back in pool. + w.buffers.Put(inbuf) + }() + nRet += len(uncompressed) + } + return nRet, nil +} + +// writeFull is a special version of write that will always write the full buffer. +// Data to be compressed should start at offset obufHeaderLen and fill the remainder of the buffer. +// The data will be written as a single block. +// The caller is not allowed to use inbuf after this function has been called. +func (w *Writer) writeFull(inbuf []byte) (errRet error) { + if err := w.err(nil); err != nil { + return err + } + + if w.concurrency == 1 { + _, err := w.writeSync(inbuf[obufHeaderLen:]) + if cap(inbuf) >= w.obufLen { + w.buffers.Put(inbuf) + } + return err + } + + // Spawn goroutine and write block to output channel. + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + if w.snappy { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} + } else { + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} + } + } + + // Get an output buffer. + obuf := w.buffers.Get().([]byte)[:w.obufLen] + uncompressed := inbuf[obufHeaderLen:] + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + res := result{ + startOffset: w.uncompWritten, + } + w.uncompWritten += int64(len(uncompressed)) + + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // Use input as output. + obuf, inbuf = inbuf, obuf + } + + // Fill in the per-chunk header that comes before the body. 
+		obuf[0] = chunkType
+		obuf[1] = uint8(chunkLen >> 0)
+		obuf[2] = uint8(chunkLen >> 8)
+		obuf[3] = uint8(chunkLen >> 16)
+		obuf[4] = uint8(checksum >> 0)
+		obuf[5] = uint8(checksum >> 8)
+		obuf[6] = uint8(checksum >> 16)
+		obuf[7] = uint8(checksum >> 24)
+
+		// Queue final output.
+		res.b = obuf
+		output <- res
+
+		// Put unused buffer back in pool.
+		w.buffers.Put(inbuf)
+	}()
+	return nil
+}
+
+func (w *Writer) writeSync(p []byte) (nRet int, errRet error) {
+	if err := w.err(nil); err != nil {
+		return 0, err
+	}
+	if !w.wroteStreamHeader {
+		w.wroteStreamHeader = true
+		var n int
+		var err error
+		if w.snappy {
+			n, err = w.writer.Write(magicChunkSnappyBytes)
+		} else {
+			n, err = w.writer.Write(magicChunkBytes)
+		}
+		if err != nil {
+			return 0, w.err(err)
+		}
+		if n != len(magicChunk) {
+			return 0, w.err(io.ErrShortWrite)
+		}
+		w.written += int64(n)
+	}
+
+	for len(p) > 0 {
+		var uncompressed []byte
+		if len(p) > w.blockSize {
+			uncompressed, p = p[:w.blockSize], p[w.blockSize:]
+		} else {
+			uncompressed, p = p, nil
+		}
+
+		obuf := w.buffers.Get().([]byte)[:w.obufLen]
+		checksum := crc(uncompressed)
+
+		// Set to uncompressed.
+		chunkType := uint8(chunkTypeUncompressedData)
+		chunkLen := 4 + len(uncompressed)
+
+		// Attempt compressing.
+		n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
+		n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
+
+		if n2 > 0 {
+			chunkType = uint8(chunkTypeCompressedData)
+			chunkLen = 4 + n + n2
+			obuf = obuf[:obufHeaderLen+n+n2]
+		} else {
+			obuf = obuf[:8]
+		}
+
+		// Fill in the per-chunk header that comes before the body.
+		obuf[0] = chunkType
+		obuf[1] = uint8(chunkLen >> 0)
+		obuf[2] = uint8(chunkLen >> 8)
+		obuf[3] = uint8(chunkLen >> 16)
+		obuf[4] = uint8(checksum >> 0)
+		obuf[5] = uint8(checksum >> 8)
+		obuf[6] = uint8(checksum >> 16)
+		obuf[7] = uint8(checksum >> 24)
+
+		n, err := w.writer.Write(obuf)
+		if err != nil {
+			return 0, w.err(err)
+		}
+		if n != len(obuf) {
+			return 0, w.err(io.ErrShortWrite)
+		}
+		w.err(w.index.add(w.written, w.uncompWritten))
+		w.written += int64(n)
+		w.uncompWritten += int64(len(uncompressed))
+
+		if chunkType == chunkTypeUncompressedData {
+			// Write uncompressed data.
+			n, err := w.writer.Write(uncompressed)
+			if err != nil {
+				return 0, w.err(err)
+			}
+			if n != len(uncompressed) {
+				return 0, w.err(io.ErrShortWrite)
+			}
+			w.written += int64(n)
+		}
+		w.buffers.Put(obuf)
+		// Queue final output.
+		nRet += len(uncompressed)
+	}
+	return nRet, nil
+}
+
+// AsyncFlush writes any buffered bytes to a block and starts compressing it.
+// Unlike Flush, it does not wait until the output has been written.
+func (w *Writer) AsyncFlush() error {
+	if err := w.err(nil); err != nil {
+		return err
+	}
+
+	// Queue any data still in the input buffer.
+	if len(w.ibuf) != 0 {
+		if !w.wroteStreamHeader {
+			_, err := w.writeSync(w.ibuf)
+			w.ibuf = w.ibuf[:0]
+			return w.err(err)
+		} else {
+			_, err := w.write(w.ibuf)
+			w.ibuf = w.ibuf[:0]
+			err = w.err(err)
+			if err != nil {
+				return err
+			}
+		}
+	}
+	return w.err(nil)
+}
+
+// Flush flushes the Writer to its underlying io.Writer.
+// This does not apply padding.
+func (w *Writer) Flush() error {
+	if err := w.AsyncFlush(); err != nil {
+		return err
+	}
+	if w.output == nil {
+		return w.err(nil)
+	}
+
+	// Send an empty buffer.
+	res := make(chan result)
+	w.output <- res
+	// Block until this has been picked up.
+	res <- result{b: nil, startOffset: w.uncompWritten}
+	// When it is closed, we have flushed.
+	<-res
+	return w.err(nil)
+}
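Flush differs from AsyncFlush in that it blocks until the writer goroutine has handed the bytes to the underlying io.Writer. A small sketch of using Flush as a synchronization point; the records are invented:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/s2"
)

func main() {
	var buf bytes.Buffer
	enc := s2.NewWriter(&buf)

	// After Flush returns, the compressed bytes for everything written
	// so far are guaranteed to be in buf.
	fmt.Fprintln(enc, "record 1")
	if err := enc.Flush(); err != nil {
		panic(err)
	}
	flushed := buf.Len()

	// More data can follow in the same stream.
	fmt.Fprintln(enc, "record 2")
	if err := enc.Close(); err != nil {
		panic(err)
	}
	fmt.Printf("bytes after Flush: %d, after Close: %d\n", flushed, buf.Len())
}
```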
+
+// Close calls Flush and then closes the Writer.
+// Calling Close multiple times is OK, but CloseIndex will no longer
+// return the index once Close has been called.
+func (w *Writer) Close() error {
+	_, err := w.closeIndex(w.appendIndex)
+	return err
+}
+
+// CloseIndex calls Close and returns the index on the first call.
+// This is not required if you are only adding the index to a stream.
+func (w *Writer) CloseIndex() ([]byte, error) {
+	return w.closeIndex(true)
+}
+
+func (w *Writer) closeIndex(idx bool) ([]byte, error) {
+	err := w.Flush()
+	if w.output != nil {
+		close(w.output)
+		w.writerWg.Wait()
+		w.output = nil
+	}
+
+	var index []byte
+	if w.err(err) == nil && w.writer != nil {
+		// Create index.
+		if idx {
+			compSize := int64(-1)
+			if w.pad <= 1 {
+				compSize = w.written
+			}
+			index = w.index.appendTo(w.ibuf[:0], w.uncompWritten, compSize)
+			// Count as written for padding.
+			if w.appendIndex {
+				w.written += int64(len(index))
+			}
+		}
+
+		if w.pad > 1 {
+			tmp := w.ibuf[:0]
+			if len(index) > 0 {
+				// Allocate another buffer.
+				tmp = w.buffers.Get().([]byte)[:0]
+				defer w.buffers.Put(tmp)
+			}
+			add := calcSkippableFrame(w.written, int64(w.pad))
+			frame, err := skippableFrame(tmp, add, w.randSrc)
+			if err = w.err(err); err != nil {
+				return nil, err
+			}
+			n, err2 := w.writer.Write(frame)
+			if err2 == nil && n != len(frame) {
+				err2 = io.ErrShortWrite
+			}
+			_ = w.err(err2)
+		}
+		if len(index) > 0 && w.appendIndex {
+			n, err2 := w.writer.Write(index)
+			if err2 == nil && n != len(index) {
+				err2 = io.ErrShortWrite
+			}
+			_ = w.err(err2)
+		}
+	}
+	err = w.err(errClosed)
+	if err == errClosed {
+		return index, nil
+	}
+	return nil, err
+}
+
+// calcSkippableFrame returns the total size to add so that written
+// becomes divisible by wantMultiple.
+// The returned value is either 0 or greater than skippableFrameHeader.
+// The function will panic if written < 0 or wantMultiple <= 0.
+func calcSkippableFrame(written, wantMultiple int64) int {
+	if wantMultiple <= 0 {
+		panic("wantMultiple <= 0")
+	}
+	if written < 0 {
+		panic("written < 0")
+	}
+	leftOver := written % wantMultiple
+	if leftOver == 0 {
+		return 0
+	}
+	toAdd := wantMultiple - leftOver
+	for toAdd < skippableFrameHeader {
+		toAdd += wantMultiple
+	}
+	return int(toAdd)
+}
+
+// skippableFrame will add a skippable frame with a total size of 'total' bytes.
+// total should be >= skippableFrameHeader and < maxBlockSize + skippableFrameHeader.
+func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) {
+	if total == 0 {
+		return dst, nil
+	}
+	if total < skippableFrameHeader {
+		return dst, fmt.Errorf("s2: requested skippable frame (%d) < 4", total)
+	}
+	if int64(total) >= maxBlockSize+skippableFrameHeader {
+		return dst, fmt.Errorf("s2: requested skippable frame (%d) >= max 1<<24", total)
+	}
+	// Chunk type 0xfe "Section 4.4 Padding (chunk type 0xfe)"
+	dst = append(dst, chunkTypePadding)
+	f := uint32(total - skippableFrameHeader)
+	// Add chunk length.
+	dst = append(dst, uint8(f), uint8(f>>8), uint8(f>>16))
+	// Add data.
+	start := len(dst)
+	dst = append(dst, make([]byte, f)...)
+	_, err := io.ReadFull(r, dst[start:])
+	return dst, err
+}
+
+var errClosed = errors.New("s2: Writer is closed")
+
+// WriterOption is an option for creating an encoder.
+type WriterOption func(*Writer) error
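WriterOption values are plain functions applied in order by NewWriter; a failing option leaves the writer in an error state that surfaces on first use. A hypothetical combination of the options defined below:

```go
package main

import (
	"bytes"

	"github.com/klauspost/compress/s2"
)

func main() {
	var buf bytes.Buffer
	enc := s2.NewWriter(&buf,
		s2.WriterConcurrency(4),      // at most 4 blocks compressed in parallel
		s2.WriterBlockSize(512<<10),  // 512KB blocks instead of the 1MB default
		s2.WriterBetterCompression(), // trade some speed for ratio
	)
	if _, err := enc.Write(bytes.Repeat([]byte("abcd"), 1<<16)); err != nil {
		panic(err)
	}
	if err := enc.Close(); err != nil {
		panic(err)
	}
}
```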
+
+// WriterConcurrency will set the concurrency,
+// meaning the maximum number of encoders to run concurrently.
+// The value supplied must be at least 1.
+// By default this will be set to GOMAXPROCS.
+func WriterConcurrency(n int) WriterOption {
+	return func(w *Writer) error {
+		if n <= 0 {
+			return errors.New("concurrency must be at least 1")
+		}
+		w.concurrency = n
+		return nil
+	}
+}
+
+// WriterAddIndex will append an index to the end of a stream
+// when it is closed.
+func WriterAddIndex() WriterOption {
+	return func(w *Writer) error {
+		w.appendIndex = true
+		return nil
+	}
+}
+
+// WriterBetterCompression will enable better compression.
+// EncodeBetter compresses better than Encode but typically with a
+// 10-40% speed decrease on both compression and decompression.
+func WriterBetterCompression() WriterOption {
+	return func(w *Writer) error {
+		w.level = levelBetter
+		return nil
+	}
+}
+
+// WriterBestCompression will enable best compression.
+// EncodeBest compresses better than Encode but typically with a
+// big speed decrease on compression.
+func WriterBestCompression() WriterOption {
+	return func(w *Writer) error {
+		w.level = levelBest
+		return nil
+	}
+}
+
+// WriterUncompressed will bypass compression.
+// The stream will be written as uncompressed blocks only.
+// If concurrency is > 1, CRC and output will still be done asynchronously.
+func WriterUncompressed() WriterOption {
+	return func(w *Writer) error {
+		w.level = levelUncompressed
+		return nil
+	}
+}
+
+// WriterBufferDone will perform a callback when EncodeBuffer has finished
+// writing a buffer to the output and the buffer can safely be reused.
+// If the buffer was split into several blocks, it will be sent after the last block.
+// Callbacks will not be done concurrently.
+func WriterBufferDone(fn func(b []byte)) WriterOption {
+	return func(w *Writer) error {
+		w.bufferCB = fn
+		return nil
+	}
+}
+
+// WriterBlockSize allows overriding the default block size.
+// Blocks will be this size or smaller.
+// Minimum size is 4KB and maximum size is 4MB.
+//
+// Bigger blocks may give bigger throughput on systems with many cores,
+// and will increase compression slightly, but it will limit the possible
+// concurrency for smaller payloads for both encoding and decoding.
+// Default block size is 1MB.
+//
+// When writing Snappy compatible output using WriterSnappyCompat,
+// the maximum block size is 64KB.
+func WriterBlockSize(n int) WriterOption {
+	return func(w *Writer) error {
+		if w.snappy && (n > maxSnappyBlockSize || n < minBlockSize) {
+			return errors.New("s2: block size out of range. Must be <= 64K and >= 4KB for snappy compatible output")
+		}
+		if n > maxBlockSize || n < minBlockSize {
+			return errors.New("s2: block size out of range. Must be <= 4MB and >= 4KB")
+		}
+		w.blockSize = n
+		return nil
+	}
+}
+
+// WriterPadding will add padding to all output so the size will be a multiple of n.
+// This can be used to obfuscate the exact output size or make blocks of a certain size.
+// The contents will be a skippable frame, so it will be invisible to the decoder.
+// n must be > 0 and <= 4MB.
+// The padded area will be filled with data from crypto/rand.Reader.
+// The padding will be applied whenever Close is called on the writer.
+func WriterPadding(n int) WriterOption {
+	return func(w *Writer) error {
+		if n <= 0 {
+			return fmt.Errorf("s2: padding must be at least 1")
+		}
+		// No need to waste our time.
+		if n == 1 {
+			w.pad = 0
+			return nil
+		}
+		if n > maxBlockSize {
+			return fmt.Errorf("s2: padding must be less than 4MB")
+		}
+		w.pad = n
+		return nil
+	}
+}
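A short sketch of the padding option above; the 4096-byte multiple is arbitrary:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/s2"
)

func main() {
	var buf bytes.Buffer
	// Pad the finished stream to a multiple of 4096 bytes; the padding is
	// a skippable frame that decoders ignore.
	enc := s2.NewWriter(&buf, s2.WriterPadding(4096))
	fmt.Fprint(enc, "some payload")
	if err := enc.Close(); err != nil {
		panic(err)
	}
	fmt.Println(buf.Len()%4096 == 0) // true
}
```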
+
+// WriterPaddingSrc will get random data for padding from the supplied source.
+// By default crypto/rand is used.
+func WriterPaddingSrc(reader io.Reader) WriterOption {
+	return func(w *Writer) error {
+		w.randSrc = reader
+		return nil
+	}
+}
+
+// WriterSnappyCompat will write snappy compatible output.
+// The output can be decompressed using either snappy or s2.
+// If the block size is larger than 64KB, it is reduced to fit.
+func WriterSnappyCompat() WriterOption {
+	return func(w *Writer) error {
+		w.snappy = true
+		if w.blockSize > 64<<10 {
+			// We choose 8 bytes less than 64K, since that will make literal emits slightly more effective.
+			// And allows us to skip some size checks.
+			w.blockSize = (64 << 10) - 8
+		}
+		return nil
+	}
+}
+
+// WriterFlushOnWrite will compress blocks on each call to the Write function.
+//
+// This is quite inefficient as block size will depend on the write size.
+//
+// Use WriterConcurrency(1) to also make sure that output has been flushed
+// when Write calls return; otherwise it will be written when compression is done.
+func WriterFlushOnWrite() WriterOption {
+	return func(w *Writer) error {
+		w.flushOnWrite = true
+		return nil
+	}
+}
+
+// WriterCustomEncoder allows overriding the encoder for blocks on the stream.
+// The function must compress 'src' into 'dst' and return the bytes used in dst as an integer.
+// Block size (initial varint) should not be added by the encoder.
+// Returning value 0 indicates the block could not be compressed.
+// Returning a negative value indicates that the default encoder should be
+// used instead.
+// The function should expect to be called concurrently.
+func WriterCustomEncoder(fn func(dst, src []byte) int) WriterOption {
+	return func(w *Writer) error {
+		w.customEnc = fn
+		return nil
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/s2sx.mod b/vendor/github.com/klauspost/compress/s2sx.mod
new file mode 100644
index 0000000000..5a4412f907
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2sx.mod
@@ -0,0 +1,4 @@
+module github.com/klauspost/compress
+
+go 1.19
+
diff --git a/vendor/github.com/klauspost/compress/s2sx.sum b/vendor/github.com/klauspost/compress/s2sx.sum
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/vendor/github.com/klauspost/compress/snappy/.gitignore b/vendor/github.com/klauspost/compress/snappy/.gitignore
deleted file mode 100644
index 042091d9b3..0000000000
--- a/vendor/github.com/klauspost/compress/snappy/.gitignore
+++ /dev/null
@@ -1,16 +0,0 @@
-cmd/snappytool/snappytool
-testdata/bench
-
-# These explicitly listed benchmark data files are for an obsolete version of
-# snappy_test.go.
-testdata/alice29.txt
-testdata/asyoulik.txt
-testdata/fireworks.jpeg
-testdata/geo.protodata
-testdata/html
-testdata/html_x_4
-testdata/kppkn.gtb
-testdata/lcet10.txt
-testdata/paper-100k.pdf
-testdata/plrabn12.txt
-testdata/urls.10K
diff --git a/vendor/github.com/klauspost/compress/snappy/AUTHORS b/vendor/github.com/klauspost/compress/snappy/AUTHORS
deleted file mode 100644
index bcfa19520a..0000000000
--- a/vendor/github.com/klauspost/compress/snappy/AUTHORS
+++ /dev/null
@@ -1,15 +0,0 @@
-# This is the official list of Snappy-Go authors for copyright purposes.
-# This file is distinct from the CONTRIBUTORS files.
-# See the latter for an explanation.
-
-# Names should be added to this file as
-#	Name or Organization
-# The email address is not required for organizations.
-
-# Please keep the list sorted.
-
-Damian Gryski
-Google Inc.
-Jan Mercl <0xjnml@gmail.com> -Rodolfo Carvalho -Sebastien Binet diff --git a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS b/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS deleted file mode 100644 index 931ae31606..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS +++ /dev/null @@ -1,37 +0,0 @@ -# This is the official list of people who can contribute -# (and typically have contributed) code to the Snappy-Go repository. -# The AUTHORS file lists the copyright holders; this file -# lists people. For example, Google employees are listed here -# but not in AUTHORS, because Google holds the copyright. -# -# The submission process automatically checks to make sure -# that people submitting code are listed in this file (by email address). -# -# Names should be added to this file only after verifying that -# the individual or the individual's organization has agreed to -# the appropriate Contributor License Agreement, found here: -# -# http://code.google.com/legal/individual-cla-v1.0.html -# http://code.google.com/legal/corporate-cla-v1.0.html -# -# The agreement for individuals can be filled out on the web. -# -# When adding J Random Contributor's name to this file, -# either J's name or J's organization's name should be -# added to the AUTHORS file, depending on whether the -# individual or corporate CLA was used. - -# Names should be added to this file like so: -# Name - -# Please keep the list sorted. - -Damian Gryski -Jan Mercl <0xjnml@gmail.com> -Kai Backman -Marc-Antoine Ruel -Nigel Tao -Rob Pike -Rodolfo Carvalho -Russ Cox -Sebastien Binet diff --git a/vendor/github.com/klauspost/compress/snappy/README b/vendor/github.com/klauspost/compress/snappy/README deleted file mode 100644 index cea12879a0..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/README +++ /dev/null @@ -1,107 +0,0 @@ -The Snappy compression format in the Go programming language. - -To download and install from source: -$ go get github.com/golang/snappy - -Unless otherwise noted, the Snappy-Go source files are distributed -under the BSD-style license found in the LICENSE file. - - - -Benchmarks. - -The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten -or so files, the same set used by the C++ Snappy code (github.com/google/snappy -and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @ -3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29: - -"go test -test.bench=." - -_UFlat0-8 2.19GB/s ± 0% html -_UFlat1-8 1.41GB/s ± 0% urls -_UFlat2-8 23.5GB/s ± 2% jpg -_UFlat3-8 1.91GB/s ± 0% jpg_200 -_UFlat4-8 14.0GB/s ± 1% pdf -_UFlat5-8 1.97GB/s ± 0% html4 -_UFlat6-8 814MB/s ± 0% txt1 -_UFlat7-8 785MB/s ± 0% txt2 -_UFlat8-8 857MB/s ± 0% txt3 -_UFlat9-8 719MB/s ± 1% txt4 -_UFlat10-8 2.84GB/s ± 0% pb -_UFlat11-8 1.05GB/s ± 0% gaviota - -_ZFlat0-8 1.04GB/s ± 0% html -_ZFlat1-8 534MB/s ± 0% urls -_ZFlat2-8 15.7GB/s ± 1% jpg -_ZFlat3-8 740MB/s ± 3% jpg_200 -_ZFlat4-8 9.20GB/s ± 1% pdf -_ZFlat5-8 991MB/s ± 0% html4 -_ZFlat6-8 379MB/s ± 0% txt1 -_ZFlat7-8 352MB/s ± 0% txt2 -_ZFlat8-8 396MB/s ± 1% txt3 -_ZFlat9-8 327MB/s ± 1% txt4 -_ZFlat10-8 1.33GB/s ± 1% pb -_ZFlat11-8 605MB/s ± 1% gaviota - - - -"go test -test.bench=. 
-tags=noasm" - -_UFlat0-8 621MB/s ± 2% html -_UFlat1-8 494MB/s ± 1% urls -_UFlat2-8 23.2GB/s ± 1% jpg -_UFlat3-8 1.12GB/s ± 1% jpg_200 -_UFlat4-8 4.35GB/s ± 1% pdf -_UFlat5-8 609MB/s ± 0% html4 -_UFlat6-8 296MB/s ± 0% txt1 -_UFlat7-8 288MB/s ± 0% txt2 -_UFlat8-8 309MB/s ± 1% txt3 -_UFlat9-8 280MB/s ± 1% txt4 -_UFlat10-8 753MB/s ± 0% pb -_UFlat11-8 400MB/s ± 0% gaviota - -_ZFlat0-8 409MB/s ± 1% html -_ZFlat1-8 250MB/s ± 1% urls -_ZFlat2-8 12.3GB/s ± 1% jpg -_ZFlat3-8 132MB/s ± 0% jpg_200 -_ZFlat4-8 2.92GB/s ± 0% pdf -_ZFlat5-8 405MB/s ± 1% html4 -_ZFlat6-8 179MB/s ± 1% txt1 -_ZFlat7-8 170MB/s ± 1% txt2 -_ZFlat8-8 189MB/s ± 1% txt3 -_ZFlat9-8 164MB/s ± 1% txt4 -_ZFlat10-8 479MB/s ± 1% pb -_ZFlat11-8 270MB/s ± 1% gaviota - - - -For comparison (Go's encoded output is byte-for-byte identical to C++'s), here -are the numbers from C++ Snappy's - -make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log - -BM_UFlat/0 2.4GB/s html -BM_UFlat/1 1.4GB/s urls -BM_UFlat/2 21.8GB/s jpg -BM_UFlat/3 1.5GB/s jpg_200 -BM_UFlat/4 13.3GB/s pdf -BM_UFlat/5 2.1GB/s html4 -BM_UFlat/6 1.0GB/s txt1 -BM_UFlat/7 959.4MB/s txt2 -BM_UFlat/8 1.0GB/s txt3 -BM_UFlat/9 864.5MB/s txt4 -BM_UFlat/10 2.9GB/s pb -BM_UFlat/11 1.2GB/s gaviota - -BM_ZFlat/0 944.3MB/s html (22.31 %) -BM_ZFlat/1 501.6MB/s urls (47.78 %) -BM_ZFlat/2 14.3GB/s jpg (99.95 %) -BM_ZFlat/3 538.3MB/s jpg_200 (73.00 %) -BM_ZFlat/4 8.3GB/s pdf (83.30 %) -BM_ZFlat/5 903.5MB/s html4 (22.52 %) -BM_ZFlat/6 336.0MB/s txt1 (57.88 %) -BM_ZFlat/7 312.3MB/s txt2 (61.91 %) -BM_ZFlat/8 353.1MB/s txt3 (54.99 %) -BM_ZFlat/9 289.9MB/s txt4 (66.26 %) -BM_ZFlat/10 1.2GB/s pb (19.68 %) -BM_ZFlat/11 527.4MB/s gaviota (37.72 %) diff --git a/vendor/github.com/klauspost/compress/snappy/decode.go b/vendor/github.com/klauspost/compress/snappy/decode.go deleted file mode 100644 index 72efb0353d..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/decode.go +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package snappy - -import ( - "encoding/binary" - "errors" - "io" -) - -var ( - // ErrCorrupt reports that the input is invalid. - ErrCorrupt = errors.New("snappy: corrupt input") - // ErrTooLarge reports that the uncompressed length is too large. - ErrTooLarge = errors.New("snappy: decoded block is too large") - // ErrUnsupported reports that the input isn't supported. - ErrUnsupported = errors.New("snappy: unsupported input") - - errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length") -) - -// DecodedLen returns the length of the decoded block. -func DecodedLen(src []byte) (int, error) { - v, _, err := decodedLen(src) - return v, err -} - -// decodedLen returns the length of the decoded block and the number of bytes -// that the length header occupied. -func decodedLen(src []byte) (blockLen, headerLen int, err error) { - v, n := binary.Uvarint(src) - if n <= 0 || v > 0xffffffff { - return 0, 0, ErrCorrupt - } - - const wordSize = 32 << (^uint(0) >> 32 & 1) - if wordSize == 32 && v > 0x7fffffff { - return 0, 0, ErrTooLarge - } - return int(v), n, nil -} - -const ( - decodeErrCodeCorrupt = 1 - decodeErrCodeUnsupportedLiteralLength = 2 -) - -// Decode returns the decoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire decoded block. -// Otherwise, a newly allocated slice will be returned. 
-// -// The dst and src must not overlap. It is valid to pass a nil dst. -func Decode(dst, src []byte) ([]byte, error) { - dLen, s, err := decodedLen(src) - if err != nil { - return nil, err - } - if dLen <= len(dst) { - dst = dst[:dLen] - } else { - dst = make([]byte, dLen) - } - switch decode(dst, src[s:]) { - case 0: - return dst, nil - case decodeErrCodeUnsupportedLiteralLength: - return nil, errUnsupportedLiteralLength - } - return nil, ErrCorrupt -} - -// NewReader returns a new Reader that decompresses from r, using the framing -// format described at -// https://github.com/google/snappy/blob/master/framing_format.txt -func NewReader(r io.Reader) *Reader { - return &Reader{ - r: r, - decoded: make([]byte, maxBlockSize), - buf: make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize), - } -} - -// Reader is an io.Reader that can read Snappy-compressed bytes. -type Reader struct { - r io.Reader - err error - decoded []byte - buf []byte - // decoded[i:j] contains decoded bytes that have not yet been passed on. - i, j int - readHeader bool -} - -// Reset discards any buffered data, resets all state, and switches the Snappy -// reader to read from r. This permits reusing a Reader rather than allocating -// a new one. -func (r *Reader) Reset(reader io.Reader) { - r.r = reader - r.err = nil - r.i = 0 - r.j = 0 - r.readHeader = false -} - -func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) { - if _, r.err = io.ReadFull(r.r, p); r.err != nil { - if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { - r.err = ErrCorrupt - } - return false - } - return true -} - -// Read satisfies the io.Reader interface. -func (r *Reader) Read(p []byte) (int, error) { - if r.err != nil { - return 0, r.err - } - for { - if r.i < r.j { - n := copy(p, r.decoded[r.i:r.j]) - r.i += n - return n, nil - } - if !r.readFull(r.buf[:4], true) { - return 0, r.err - } - chunkType := r.buf[0] - if !r.readHeader { - if chunkType != chunkTypeStreamIdentifier { - r.err = ErrCorrupt - return 0, r.err - } - r.readHeader = true - } - chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 - if chunkLen > len(r.buf) { - r.err = ErrUnsupported - return 0, r.err - } - - // The chunk types are specified at - // https://github.com/google/snappy/blob/master/framing_format.txt - switch chunkType { - case chunkTypeCompressedData: - // Section 4.2. Compressed data (chunk type 0x00). - if chunkLen < checksumSize { - r.err = ErrCorrupt - return 0, r.err - } - buf := r.buf[:chunkLen] - if !r.readFull(buf, false) { - return 0, r.err - } - checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 - buf = buf[checksumSize:] - - n, err := DecodedLen(buf) - if err != nil { - r.err = err - return 0, r.err - } - if n > len(r.decoded) { - r.err = ErrCorrupt - return 0, r.err - } - if _, err := Decode(r.decoded, buf); err != nil { - r.err = err - return 0, r.err - } - if crc(r.decoded[:n]) != checksum { - r.err = ErrCorrupt - return 0, r.err - } - r.i, r.j = 0, n - continue - - case chunkTypeUncompressedData: - // Section 4.3. Uncompressed data (chunk type 0x01). - if chunkLen < checksumSize { - r.err = ErrCorrupt - return 0, r.err - } - buf := r.buf[:checksumSize] - if !r.readFull(buf, false) { - return 0, r.err - } - checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 - // Read directly into r.decoded instead of via r.buf. 
- n := chunkLen - checksumSize - if n > len(r.decoded) { - r.err = ErrCorrupt - return 0, r.err - } - if !r.readFull(r.decoded[:n], false) { - return 0, r.err - } - if crc(r.decoded[:n]) != checksum { - r.err = ErrCorrupt - return 0, r.err - } - r.i, r.j = 0, n - continue - - case chunkTypeStreamIdentifier: - // Section 4.1. Stream identifier (chunk type 0xff). - if chunkLen != len(magicBody) { - r.err = ErrCorrupt - return 0, r.err - } - if !r.readFull(r.buf[:len(magicBody)], false) { - return 0, r.err - } - for i := 0; i < len(magicBody); i++ { - if r.buf[i] != magicBody[i] { - r.err = ErrCorrupt - return 0, r.err - } - } - continue - } - - if chunkType <= 0x7f { - // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). - r.err = ErrUnsupported - return 0, r.err - } - // Section 4.4 Padding (chunk type 0xfe). - // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). - if !r.readFull(r.buf[:chunkLen], false) { - return 0, r.err - } - } -} diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.go b/vendor/github.com/klauspost/compress/snappy/decode_amd64.go deleted file mode 100644 index fcd192b849..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/decode_amd64.go +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -package snappy - -// decode has the same semantics as in decode_other.go. -// -//go:noescape -func decode(dst, src []byte) int diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s b/vendor/github.com/klauspost/compress/snappy/decode_amd64.s deleted file mode 100644 index 1c66e37234..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s +++ /dev/null @@ -1,482 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" - -// The asm code generally follows the pure Go code in decode_other.go, except -// where marked with a "!!!". - -// func decode(dst, src []byte) int -// -// All local variables fit into registers. The non-zero stack size is only to -// spill registers and push args when issuing a CALL. The register allocation: -// - AX scratch -// - BX scratch -// - CX length or x -// - DX offset -// - SI &src[s] -// - DI &dst[d] -// + R8 dst_base -// + R9 dst_len -// + R10 dst_base + dst_len -// + R11 src_base -// + R12 src_len -// + R13 src_base + src_len -// - R14 used by doCopy -// - R15 used by doCopy -// -// The registers R8-R13 (marked with a "+") are set at the start of the -// function, and after a CALL returns, and are not otherwise modified. -// -// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI. -// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI. -TEXT ·decode(SB), NOSPLIT, $48-56 - // Initialize SI, DI and R8-R13. 
- MOVQ dst_base+0(FP), R8 - MOVQ dst_len+8(FP), R9 - MOVQ R8, DI - MOVQ R8, R10 - ADDQ R9, R10 - MOVQ src_base+24(FP), R11 - MOVQ src_len+32(FP), R12 - MOVQ R11, SI - MOVQ R11, R13 - ADDQ R12, R13 - -loop: - // for s < len(src) - CMPQ SI, R13 - JEQ end - - // CX = uint32(src[s]) - // - // switch src[s] & 0x03 - MOVBLZX (SI), CX - MOVL CX, BX - ANDL $3, BX - CMPL BX, $1 - JAE tagCopy - - // ---------------------------------------- - // The code below handles literal tags. - - // case tagLiteral: - // x := uint32(src[s] >> 2) - // switch - SHRL $2, CX - CMPL CX, $60 - JAE tagLit60Plus - - // case x < 60: - // s++ - INCQ SI - -doLit: - // This is the end of the inner "switch", when we have a literal tag. - // - // We assume that CX == x and x fits in a uint32, where x is the variable - // used in the pure Go decode_other.go code. - - // length = int(x) + 1 - // - // Unlike the pure Go code, we don't need to check if length <= 0 because - // CX can hold 64 bits, so the increment cannot overflow. - INCQ CX - - // Prepare to check if copying length bytes will run past the end of dst or - // src. - // - // AX = len(dst) - d - // BX = len(src) - s - MOVQ R10, AX - SUBQ DI, AX - MOVQ R13, BX - SUBQ SI, BX - - // !!! Try a faster technique for short (16 or fewer bytes) copies. - // - // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { - // goto callMemmove // Fall back on calling runtime·memmove. - // } - // - // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s - // against 21 instead of 16, because it cannot assume that all of its input - // is contiguous in memory and so it needs to leave enough source bytes to - // read the next tag without refilling buffers, but Go's Decode assumes - // contiguousness (the src argument is a []byte). - CMPQ CX, $16 - JGT callMemmove - CMPQ AX, $16 - JLT callMemmove - CMPQ BX, $16 - JLT callMemmove - - // !!! Implement the copy from src to dst as a 16-byte load and store. - // (Decode's documentation says that dst and src must not overlap.) - // - // This always copies 16 bytes, instead of only length bytes, but that's - // OK. If the input is a valid Snappy encoding then subsequent iterations - // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a - // non-nil error), so the overrun will be ignored. - // - // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or - // 16-byte loads and stores. This technique probably wouldn't be as - // effective on architectures that are fussier about alignment. - MOVOU 0(SI), X0 - MOVOU X0, 0(DI) - - // d += length - // s += length - ADDQ CX, DI - ADDQ CX, SI - JMP loop - -callMemmove: - // if length > len(dst)-d || length > len(src)-s { etc } - CMPQ CX, AX - JGT errCorrupt - CMPQ CX, BX - JGT errCorrupt - - // copy(dst[d:], src[s:s+length]) - // - // This means calling runtime·memmove(&dst[d], &src[s], length), so we push - // DI, SI and CX as arguments. Coincidentally, we also need to spill those - // three registers to the stack, to save local variables across the CALL. - MOVQ DI, 0(SP) - MOVQ SI, 8(SP) - MOVQ CX, 16(SP) - MOVQ DI, 24(SP) - MOVQ SI, 32(SP) - MOVQ CX, 40(SP) - CALL runtime·memmove(SB) - - // Restore local variables: unspill registers from the stack and - // re-calculate R8-R13. 
- MOVQ 24(SP), DI - MOVQ 32(SP), SI - MOVQ 40(SP), CX - MOVQ dst_base+0(FP), R8 - MOVQ dst_len+8(FP), R9 - MOVQ R8, R10 - ADDQ R9, R10 - MOVQ src_base+24(FP), R11 - MOVQ src_len+32(FP), R12 - MOVQ R11, R13 - ADDQ R12, R13 - - // d += length - // s += length - ADDQ CX, DI - ADDQ CX, SI - JMP loop - -tagLit60Plus: - // !!! This fragment does the - // - // s += x - 58; if uint(s) > uint(len(src)) { etc } - // - // checks. In the asm version, we code it once instead of once per switch case. - ADDQ CX, SI - SUBQ $58, SI - CMPQ SI, R13 - JA errCorrupt - - // case x == 60: - CMPL CX, $61 - JEQ tagLit61 - JA tagLit62Plus - - // x = uint32(src[s-1]) - MOVBLZX -1(SI), CX - JMP doLit - -tagLit61: - // case x == 61: - // x = uint32(src[s-2]) | uint32(src[s-1])<<8 - MOVWLZX -2(SI), CX - JMP doLit - -tagLit62Plus: - CMPL CX, $62 - JA tagLit63 - - // case x == 62: - // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 - MOVWLZX -3(SI), CX - MOVBLZX -1(SI), BX - SHLL $16, BX - ORL BX, CX - JMP doLit - -tagLit63: - // case x == 63: - // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 - MOVL -4(SI), CX - JMP doLit - -// The code above handles literal tags. -// ---------------------------------------- -// The code below handles copy tags. - -tagCopy4: - // case tagCopy4: - // s += 5 - ADDQ $5, SI - - // if uint(s) > uint(len(src)) { etc } - CMPQ SI, R13 - JA errCorrupt - - // length = 1 + int(src[s-5])>>2 - SHRQ $2, CX - INCQ CX - - // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) - MOVLQZX -4(SI), DX - JMP doCopy - -tagCopy2: - // case tagCopy2: - // s += 3 - ADDQ $3, SI - - // if uint(s) > uint(len(src)) { etc } - CMPQ SI, R13 - JA errCorrupt - - // length = 1 + int(src[s-3])>>2 - SHRQ $2, CX - INCQ CX - - // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) - MOVWQZX -2(SI), DX - JMP doCopy - -tagCopy: - // We have a copy tag. We assume that: - // - BX == src[s] & 0x03 - // - CX == src[s] - CMPQ BX, $2 - JEQ tagCopy2 - JA tagCopy4 - - // case tagCopy1: - // s += 2 - ADDQ $2, SI - - // if uint(s) > uint(len(src)) { etc } - CMPQ SI, R13 - JA errCorrupt - - // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) - MOVQ CX, DX - ANDQ $0xe0, DX - SHLQ $3, DX - MOVBQZX -1(SI), BX - ORQ BX, DX - - // length = 4 + int(src[s-2])>>2&0x7 - SHRQ $2, CX - ANDQ $7, CX - ADDQ $4, CX - -doCopy: - // This is the end of the outer "switch", when we have a copy tag. - // - // We assume that: - // - CX == length && CX > 0 - // - DX == offset - - // if offset <= 0 { etc } - CMPQ DX, $0 - JLE errCorrupt - - // if d < offset { etc } - MOVQ DI, BX - SUBQ R8, BX - CMPQ BX, DX - JLT errCorrupt - - // if length > len(dst)-d { etc } - MOVQ R10, BX - SUBQ DI, BX - CMPQ CX, BX - JGT errCorrupt - - // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length - // - // Set: - // - R14 = len(dst)-d - // - R15 = &dst[d-offset] - MOVQ R10, R14 - SUBQ DI, R14 - MOVQ DI, R15 - SUBQ DX, R15 - - // !!! Try a faster technique for short (16 or fewer bytes) forward copies. - // - // First, try using two 8-byte load/stores, similar to the doLit technique - // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is - // still OK if offset >= 8. Note that this has to be two 8-byte load/stores - // and not one 16-byte load/store, and the first store has to be before the - // second load, due to the overlap if offset is in the range [8, 16). 
- // - // if length > 16 || offset < 8 || len(dst)-d < 16 { - // goto slowForwardCopy - // } - // copy 16 bytes - // d += length - CMPQ CX, $16 - JGT slowForwardCopy - CMPQ DX, $8 - JLT slowForwardCopy - CMPQ R14, $16 - JLT slowForwardCopy - MOVQ 0(R15), AX - MOVQ AX, 0(DI) - MOVQ 8(R15), BX - MOVQ BX, 8(DI) - ADDQ CX, DI - JMP loop - -slowForwardCopy: - // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we - // can still try 8-byte load stores, provided we can overrun up to 10 extra - // bytes. As above, the overrun will be fixed up by subsequent iterations - // of the outermost loop. - // - // The C++ snappy code calls this technique IncrementalCopyFastPath. Its - // commentary says: - // - // ---- - // - // The main part of this loop is a simple copy of eight bytes at a time - // until we've copied (at least) the requested amount of bytes. However, - // if d and d-offset are less than eight bytes apart (indicating a - // repeating pattern of length < 8), we first need to expand the pattern in - // order to get the correct results. For instance, if the buffer looks like - // this, with the eight-byte and patterns marked as - // intervals: - // - // abxxxxxxxxxxxx - // [------] d-offset - // [------] d - // - // a single eight-byte copy from to will repeat the pattern - // once, after which we can move two bytes without moving : - // - // ababxxxxxxxxxx - // [------] d-offset - // [------] d - // - // and repeat the exercise until the two no longer overlap. - // - // This allows us to do very well in the special case of one single byte - // repeated many times, without taking a big hit for more general cases. - // - // The worst case of extra writing past the end of the match occurs when - // offset == 1 and length == 1; the last copy will read from byte positions - // [0..7] and write to [4..11], whereas it was only supposed to write to - // position 1. Thus, ten excess bytes. - // - // ---- - // - // That "10 byte overrun" worst case is confirmed by Go's - // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy - // and finishSlowForwardCopy algorithm. - // - // if length > len(dst)-d-10 { - // goto verySlowForwardCopy - // } - SUBQ $10, R14 - CMPQ CX, R14 - JGT verySlowForwardCopy - -makeOffsetAtLeast8: - // !!! As above, expand the pattern so that offset >= 8 and we can use - // 8-byte load/stores. - // - // for offset < 8 { - // copy 8 bytes from dst[d-offset:] to dst[d:] - // length -= offset - // d += offset - // offset += offset - // // The two previous lines together means that d-offset, and therefore - // // R15, is unchanged. - // } - CMPQ DX, $8 - JGE fixUpSlowForwardCopy - MOVQ (R15), BX - MOVQ BX, (DI) - SUBQ DX, CX - ADDQ DX, DI - ADDQ DX, DX - JMP makeOffsetAtLeast8 - -fixUpSlowForwardCopy: - // !!! Add length (which might be negative now) to d (implied by DI being - // &dst[d]) so that d ends up at the right place when we jump back to the - // top of the loop. Before we do that, though, we save DI to AX so that, if - // length is positive, copying the remaining length bytes will write to the - // right place. - MOVQ DI, AX - ADDQ CX, DI - -finishSlowForwardCopy: - // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative - // length means that we overrun, but as above, that will be fixed up by - // subsequent iterations of the outermost loop. 
- CMPQ CX, $0 - JLE loop - MOVQ (R15), BX - MOVQ BX, (AX) - ADDQ $8, R15 - ADDQ $8, AX - SUBQ $8, CX - JMP finishSlowForwardCopy - -verySlowForwardCopy: - // verySlowForwardCopy is a simple implementation of forward copy. In C - // parlance, this is a do/while loop instead of a while loop, since we know - // that length > 0. In Go syntax: - // - // for { - // dst[d] = dst[d - offset] - // d++ - // length-- - // if length == 0 { - // break - // } - // } - MOVB (R15), BX - MOVB BX, (DI) - INCQ R15 - INCQ DI - DECQ CX - JNZ verySlowForwardCopy - JMP loop - -// The code above handles copy tags. -// ---------------------------------------- - -end: - // This is the end of the "for s < len(src)". - // - // if d != len(dst) { etc } - CMPQ DI, R10 - JNE errCorrupt - - // return 0 - MOVQ $0, ret+48(FP) - RET - -errCorrupt: - // return decodeErrCodeCorrupt - MOVQ $1, ret+48(FP) - RET diff --git a/vendor/github.com/klauspost/compress/snappy/decode_other.go b/vendor/github.com/klauspost/compress/snappy/decode_other.go deleted file mode 100644 index 94a96c5d7b..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/decode_other.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !amd64 appengine !gc noasm - -package snappy - -// decode writes the decoding of src to dst. It assumes that the varint-encoded -// length of the decompressed bytes has already been read, and that len(dst) -// equals that length. -// -// It returns 0 on success or a decodeErrCodeXxx error code on failure. -func decode(dst, src []byte) int { - var d, s, offset, length int - for s < len(src) { - switch src[s] & 0x03 { - case tagLiteral: - x := uint32(src[s] >> 2) - switch { - case x < 60: - s++ - case x == 60: - s += 2 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - x = uint32(src[s-1]) - case x == 61: - s += 3 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - x = uint32(src[s-2]) | uint32(src[s-1])<<8 - case x == 62: - s += 4 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 - case x == 63: - s += 5 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 - } - length = int(x) + 1 - if length <= 0 { - return decodeErrCodeUnsupportedLiteralLength - } - if length > len(dst)-d || length > len(src)-s { - return decodeErrCodeCorrupt - } - copy(dst[d:], src[s:s+length]) - d += length - s += length - continue - - case tagCopy1: - s += 2 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - length = 4 + int(src[s-2])>>2&0x7 - offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) - - case tagCopy2: - s += 3 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. 
- return decodeErrCodeCorrupt - } - length = 1 + int(src[s-3])>>2 - offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) - - case tagCopy4: - s += 5 - if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. - return decodeErrCodeCorrupt - } - length = 1 + int(src[s-5])>>2 - offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) - } - - if offset <= 0 || d < offset || length > len(dst)-d { - return decodeErrCodeCorrupt - } - // Copy from an earlier sub-slice of dst to a later sub-slice. - // If no overlap, use the built-in copy: - if offset > length { - copy(dst[d:d+length], dst[d-offset:]) - d += length - continue - } - - // Unlike the built-in copy function, this byte-by-byte copy always runs - // forwards, even if the slices overlap. Conceptually, this is: - // - // d += forwardCopy(dst[d:d+length], dst[d-offset:]) - // - // We align the slices into a and b and show the compiler they are the same size. - // This allows the loop to run without bounds checks. - a := dst[d : d+length] - b := dst[d-offset:] - b = b[:len(a)] - for i := range a { - a[i] = b[i] - } - d += length - } - if d != len(dst) { - return decodeErrCodeCorrupt - } - return 0 -} diff --git a/vendor/github.com/klauspost/compress/snappy/encode.go b/vendor/github.com/klauspost/compress/snappy/encode.go deleted file mode 100644 index 8d393e904b..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/encode.go +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright 2011 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package snappy - -import ( - "encoding/binary" - "errors" - "io" -) - -// Encode returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -func Encode(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if len(dst) < n { - dst = make([]byte, n) - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - for len(src) > 0 { - p := src - src = nil - if len(p) > maxBlockSize { - p, src = p[:maxBlockSize], p[maxBlockSize:] - } - if len(p) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], p) - } else { - d += encodeBlock(dst[d:], p) - } - } - return dst[:d] -} - -// inputMargin is the minimum number of extra input bytes to keep, inside -// encodeBlock's inner loop. On some architectures, this margin lets us -// implement a fast path for emitLiteral, where the copy of short (<= 16 byte) -// literals can be implemented as a single load to and store from a 16-byte -// register. That literal's actual length can be as short as 1 byte, so this -// can copy up to 15 bytes too much, but that's OK as subsequent iterations of -// the encoding loop will fix up the copy overrun, and this inputMargin ensures -// that we don't overrun the dst and src buffers. -const inputMargin = 16 - 1 - -// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that -// could be encoded with a copy tag. This is the minimum with respect to the -// algorithm used by encodeBlock, not a minimum enforced by the file format. 
-// -// The encoded output must start with at least a 1 byte literal, as there are -// no previous bytes to copy. A minimal (1 byte) copy after that, generated -// from an emitCopy call in encodeBlock's main loop, would require at least -// another inputMargin bytes, for the reason above: we want any emitLiteral -// calls inside encodeBlock's main loop to use the fast path if possible, which -// requires being able to overrun by inputMargin bytes. Thus, -// minNonLiteralBlockSize equals 1 + 1 + inputMargin. -// -// The C++ code doesn't use this exact threshold, but it could, as discussed at -// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion -// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an -// optimization. It should not affect the encoded form. This is tested by -// TestSameEncodingAsCppShortCopies. -const minNonLiteralBlockSize = 1 + 1 + inputMargin - -// MaxEncodedLen returns the maximum length of a snappy block, given its -// uncompressed length. -// -// It will return a negative value if srcLen is too large to encode. -func MaxEncodedLen(srcLen int) int { - n := uint64(srcLen) - if n > 0xffffffff { - return -1 - } - // Compressed data can be defined as: - // compressed := item* literal* - // item := literal* copy - // - // The trailing literal sequence has a space blowup of at most 62/60 - // since a literal of length 60 needs one tag byte + one extra byte - // for length information. - // - // Item blowup is trickier to measure. Suppose the "copy" op copies - // 4 bytes of data. Because of a special check in the encoding code, - // we produce a 4-byte copy only if the offset is < 65536. Therefore - // the copy op takes 3 bytes to encode, and this type of item leads - // to at most the 62/60 blowup for representing literals. - // - // Suppose the "copy" op copies 5 bytes of data. If the offset is big - // enough, it will take 5 bytes to encode the copy op. Therefore the - // worst case here is a one-byte literal followed by a five-byte copy. - // That is, 6 bytes of input turn into 7 bytes of "compressed" data. - // - // This last factor dominates the blowup, so the final estimate is: - n = 32 + n + n/6 - if n > 0xffffffff { - return -1 - } - return int(n) -} - -var errClosed = errors.New("snappy: Writer is closed") - -// NewWriter returns a new Writer that compresses to w. -// -// The Writer returned does not buffer writes. There is no need to Flush or -// Close such a Writer. -// -// Deprecated: the Writer returned is not suitable for many small writes, only -// for few large writes. Use NewBufferedWriter instead, which is efficient -// regardless of the frequency and shape of the writes, and remember to Close -// that Writer when done. -func NewWriter(w io.Writer) *Writer { - return &Writer{ - w: w, - obuf: make([]byte, obufLen), - } -} - -// NewBufferedWriter returns a new Writer that compresses to w, using the -// framing format described at -// https://github.com/google/snappy/blob/master/framing_format.txt -// -// The Writer returned buffers writes. Users must call Close to guarantee all -// data has been forwarded to the underlying io.Writer. They may also call -// Flush zero or more times before calling Close. -func NewBufferedWriter(w io.Writer) *Writer { - return &Writer{ - w: w, - ibuf: make([]byte, 0, maxBlockSize), - obuf: make([]byte, obufLen), - } -} - -// Writer is an io.Writer that can write Snappy-compressed bytes. 
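Editor's note: the MaxEncodedLen bound derived above reduces to n = 32 + n + n/6. A minimal standalone sketch (illustrative only, not part of this diff; the lowercase function name is invented) that reproduces the deleted function's arithmetic and works one value through it:

package main

import "fmt"

// maxEncodedLen mirrors the bound above: 32 bytes of slack, the input
// itself, plus the worst-case 1-in-6 item blowup.
func maxEncodedLen(srcLen int) int {
	n := uint64(srcLen)
	if n > 0xffffffff {
		return -1 // too large to encode
	}
	n = 32 + n + n/6
	if n > 0xffffffff {
		return -1
	}
	return int(n)
}

func main() {
	// For a maximum-size 64 KiB block: 32 + 65536 + 10922 = 76490 bytes.
	fmt.Println(maxEncodedLen(65536))
}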
-type Writer struct { - w io.Writer - err error - - // ibuf is a buffer for the incoming (uncompressed) bytes. - // - // Its use is optional. For backwards compatibility, Writers created by the - // NewWriter function have ibuf == nil, do not buffer incoming bytes, and - // therefore do not need to be Flush'ed or Close'd. - ibuf []byte - - // obuf is a buffer for the outgoing (compressed) bytes. - obuf []byte - - // wroteStreamHeader is whether we have written the stream header. - wroteStreamHeader bool -} - -// Reset discards the writer's state and switches the Snappy writer to write to -// w. This permits reusing a Writer rather than allocating a new one. -func (w *Writer) Reset(writer io.Writer) { - w.w = writer - w.err = nil - if w.ibuf != nil { - w.ibuf = w.ibuf[:0] - } - w.wroteStreamHeader = false -} - -// Write satisfies the io.Writer interface. -func (w *Writer) Write(p []byte) (nRet int, errRet error) { - if w.ibuf == nil { - // Do not buffer incoming bytes. This does not perform or compress well - // if the caller of Writer.Write writes many small slices. This - // behavior is therefore deprecated, but still supported for backwards - // compatibility with code that doesn't explicitly Flush or Close. - return w.write(p) - } - - // The remainder of this method is based on bufio.Writer.Write from the - // standard library. - - for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil { - var n int - if len(w.ibuf) == 0 { - // Large write, empty buffer. - // Write directly from p to avoid copy. - n, _ = w.write(p) - } else { - n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) - w.ibuf = w.ibuf[:len(w.ibuf)+n] - w.Flush() - } - nRet += n - p = p[n:] - } - if w.err != nil { - return nRet, w.err - } - n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) - w.ibuf = w.ibuf[:len(w.ibuf)+n] - nRet += n - return nRet, nil -} - -func (w *Writer) write(p []byte) (nRet int, errRet error) { - if w.err != nil { - return 0, w.err - } - for len(p) > 0 { - obufStart := len(magicChunk) - if !w.wroteStreamHeader { - w.wroteStreamHeader = true - copy(w.obuf, magicChunk) - obufStart = 0 - } - - var uncompressed []byte - if len(p) > maxBlockSize { - uncompressed, p = p[:maxBlockSize], p[maxBlockSize:] - } else { - uncompressed, p = p, nil - } - checksum := crc(uncompressed) - - // Compress the buffer, discarding the result if the improvement - // isn't at least 12.5%. - compressed := Encode(w.obuf[obufHeaderLen:], uncompressed) - chunkType := uint8(chunkTypeCompressedData) - chunkLen := 4 + len(compressed) - obufEnd := obufHeaderLen + len(compressed) - if len(compressed) >= len(uncompressed)-len(uncompressed)/8 { - chunkType = chunkTypeUncompressedData - chunkLen = 4 + len(uncompressed) - obufEnd = obufHeaderLen - } - - // Fill in the per-chunk header that comes before the body. 
- w.obuf[len(magicChunk)+0] = chunkType - w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0) - w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8) - w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16) - w.obuf[len(magicChunk)+4] = uint8(checksum >> 0) - w.obuf[len(magicChunk)+5] = uint8(checksum >> 8) - w.obuf[len(magicChunk)+6] = uint8(checksum >> 16) - w.obuf[len(magicChunk)+7] = uint8(checksum >> 24) - - if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil { - w.err = err - return nRet, err - } - if chunkType == chunkTypeUncompressedData { - if _, err := w.w.Write(uncompressed); err != nil { - w.err = err - return nRet, err - } - } - nRet += len(uncompressed) - } - return nRet, nil -} - -// Flush flushes the Writer to its underlying io.Writer. -func (w *Writer) Flush() error { - if w.err != nil { - return w.err - } - if len(w.ibuf) == 0 { - return nil - } - w.write(w.ibuf) - w.ibuf = w.ibuf[:0] - return w.err -} - -// Close calls Flush and then closes the Writer. -func (w *Writer) Close() error { - w.Flush() - ret := w.err - if w.err == nil { - w.err = errClosed - } - return ret -} diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.go b/vendor/github.com/klauspost/compress/snappy/encode_amd64.go deleted file mode 100644 index 150d91bc8b..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/encode_amd64.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -package snappy - -// emitLiteral has the same semantics as in encode_other.go. -// -//go:noescape -func emitLiteral(dst, lit []byte) int - -// emitCopy has the same semantics as in encode_other.go. -// -//go:noescape -func emitCopy(dst []byte, offset, length int) int - -// extendMatch has the same semantics as in encode_other.go. -// -//go:noescape -func extendMatch(src []byte, i, j int) int - -// encodeBlock has the same semantics as in encode_other.go. -// -//go:noescape -func encodeBlock(dst, src []byte) (d int) diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.s b/vendor/github.com/klauspost/compress/snappy/encode_amd64.s deleted file mode 100644 index adfd979fe2..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/encode_amd64.s +++ /dev/null @@ -1,730 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build !noasm - -#include "textflag.h" - -// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a -// Go toolchain regression. See https://github.com/golang/go/issues/15426 and -// https://github.com/golang/snappy/issues/29 -// -// As a workaround, the package was built with a known good assembler, and -// those instructions were disassembled by "objdump -d" to yield the -// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 -// style comments, in AT&T asm syntax. Note that rsp here is a physical -// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm). -// The instructions were then encoded as "BYTE $0x.." sequences, which assemble -// fine on Go 1.6. - -// The asm code generally follows the pure Go code in encode_other.go, except -// where marked with a "!!!". 
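Editor's note: to make the framing concrete, the per-chunk header that Writer.write fills in above is one chunk-type byte, a 3-byte little-endian chunk length (the 4-byte checksum plus the body), and the 4-byte little-endian CRC itself. A self-contained sketch of that layout (illustrative only, not part of this diff; the function name is invented):

package main

import "fmt"

// chunkHeader mirrors the layout Writer.write produces: type byte,
// 3-byte little-endian length (checksum + body), then the 4-byte
// little-endian CRC of the uncompressed data.
func chunkHeader(chunkType uint8, checksum uint32, bodyLen int) [8]byte {
	var h [8]byte
	chunkLen := 4 + bodyLen // the 4-byte checksum counts toward the chunk length
	h[0] = chunkType
	h[1] = uint8(chunkLen)
	h[2] = uint8(chunkLen >> 8)
	h[3] = uint8(chunkLen >> 16)
	h[4] = uint8(checksum)
	h[5] = uint8(checksum >> 8)
	h[6] = uint8(checksum >> 16)
	h[7] = uint8(checksum >> 24)
	return h
}

func main() {
	// 0x00 is the compressed-data chunk type in the snappy framing format.
	fmt.Printf("% x\n", chunkHeader(0x00, 0xdeadbeef, 100))
}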
- -// ---------------------------------------------------------------------------- - -// func emitLiteral(dst, lit []byte) int -// -// All local variables fit into registers. The register allocation: -// - AX len(lit) -// - BX n -// - DX return value -// - DI &dst[i] -// - R10 &lit[0] -// -// The 24 bytes of stack space is to call runtime·memmove. -// -// The unusual register allocation of local variables, such as R10 for the -// source pointer, matches the allocation used at the call site in encodeBlock, -// which makes it easier to manually inline this function. -TEXT ·emitLiteral(SB), NOSPLIT, $24-56 - MOVQ dst_base+0(FP), DI - MOVQ lit_base+24(FP), R10 - MOVQ lit_len+32(FP), AX - MOVQ AX, DX - MOVL AX, BX - SUBL $1, BX - - CMPL BX, $60 - JLT oneByte - CMPL BX, $256 - JLT twoBytes - -threeBytes: - MOVB $0xf4, 0(DI) - MOVW BX, 1(DI) - ADDQ $3, DI - ADDQ $3, DX - JMP memmove - -twoBytes: - MOVB $0xf0, 0(DI) - MOVB BX, 1(DI) - ADDQ $2, DI - ADDQ $2, DX - JMP memmove - -oneByte: - SHLB $2, BX - MOVB BX, 0(DI) - ADDQ $1, DI - ADDQ $1, DX - -memmove: - MOVQ DX, ret+48(FP) - - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // DI, R10 and AX as arguments. - MOVQ DI, 0(SP) - MOVQ R10, 8(SP) - MOVQ AX, 16(SP) - CALL runtime·memmove(SB) - RET - -// ---------------------------------------------------------------------------- - -// func emitCopy(dst []byte, offset, length int) int -// -// All local variables fit into registers. The register allocation: -// - AX length -// - SI &dst[0] -// - DI &dst[i] -// - R11 offset -// -// The unusual register allocation of local variables, such as R11 for the -// offset, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. -TEXT ·emitCopy(SB), NOSPLIT, $0-48 - MOVQ dst_base+0(FP), DI - MOVQ DI, SI - MOVQ offset+24(FP), R11 - MOVQ length+32(FP), AX - -loop0: - // for length >= 68 { etc } - CMPL AX, $68 - JLT step1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVB $0xfe, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $64, AX - JMP loop0 - -step1: - // if length > 64 { etc } - CMPL AX, $64 - JLE step2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVB $0xee, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $60, AX - -step2: - // if length >= 12 || offset >= 2048 { goto step3 } - CMPL AX, $12 - JGE step3 - CMPL R11, $2048 - JGE step3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(DI) - SHRL $8, R11 - SHLB $5, R11 - SUBB $4, AX - SHLB $2, AX - ORB AX, R11 - ORB $1, R11 - MOVB R11, 0(DI) - ADDQ $2, DI - - // Return the number of bytes written. - SUBQ SI, DI - MOVQ DI, ret+40(FP) - RET - -step3: - // Emit the remaining copy, encoded as 3 bytes. - SUBL $1, AX - SHLB $2, AX - ORB $2, AX - MOVB AX, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - - // Return the number of bytes written. - SUBQ SI, DI - MOVQ DI, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func extendMatch(src []byte, i, j int) int -// -// All local variables fit into registers. The register allocation: -// - DX &src[0] -// - SI &src[j] -// - R13 &src[len(src) - 8] -// - R14 &src[len(src)] -// - R15 &src[i] -// -// The unusual register allocation of local variables, such as R15 for a source -// pointer, matches the allocation used at the call site in encodeBlock, which -// makes it easier to manually inline this function. 
-TEXT ·extendMatch(SB), NOSPLIT, $0-48 - MOVQ src_base+0(FP), DX - MOVQ src_len+8(FP), R14 - MOVQ i+24(FP), R15 - MOVQ j+32(FP), SI - ADDQ DX, R14 - ADDQ DX, R15 - ADDQ DX, SI - MOVQ R14, R13 - SUBQ $8, R13 - -cmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMPQ SI, R13 - JA cmp1 - MOVQ (R15), AX - MOVQ (SI), BX - CMPQ AX, BX - JNE bsf - ADDQ $8, R15 - ADDQ $8, SI - JMP cmp8 - -bsf: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. The BSF instruction finds the - // least significant 1 bit, the amd64 architecture is little-endian, and - // the shift by 3 converts a bit index to a byte index. - XORQ AX, BX - BSFQ BX, BX - SHRQ $3, BX - ADDQ BX, SI - - // Convert from &src[ret] to ret. - SUBQ DX, SI - MOVQ SI, ret+40(FP) - RET - -cmp1: - // In src's tail, compare 1 byte at a time. - CMPQ SI, R14 - JAE extendMatchEnd - MOVB (R15), AX - MOVB (SI), BX - CMPB AX, BX - JNE extendMatchEnd - ADDQ $1, R15 - ADDQ $1, SI - JMP cmp1 - -extendMatchEnd: - // Convert from &src[ret] to ret. - SUBQ DX, SI - MOVQ SI, ret+40(FP) - RET - -// ---------------------------------------------------------------------------- - -// func encodeBlock(dst, src []byte) (d int) -// -// All local variables fit into registers, other than "var table". The register -// allocation: -// - AX . . -// - BX . . -// - CX 56 shift (note that amd64 shifts by non-immediates must use CX). -// - DX 64 &src[0], tableSize -// - SI 72 &src[s] -// - DI 80 &dst[d] -// - R9 88 sLimit -// - R10 . &src[nextEmit] -// - R11 96 prevHash, currHash, nextHash, offset -// - R12 104 &src[base], skip -// - R13 . &src[nextS], &src[len(src) - 8] -// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x -// - R15 112 candidate -// -// The second column (56, 64, etc) is the stack offset to spill the registers -// when calling other functions. We could pack this slightly tighter, but it's -// simpler to have a dedicated spill map independent of the function called. -// -// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An -// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill -// local variables (registers) during calls gives 32768 + 56 + 64 = 32888. -TEXT ·encodeBlock(SB), 0, $32888-56 - MOVQ dst_base+0(FP), DI - MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), R14 - - // shift, tableSize := uint32(32-8), 1<<8 - MOVQ $24, CX - MOVQ $256, DX - -calcShift: - // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { - // shift-- - // } - CMPQ DX, $16384 - JGE varTable - CMPQ DX, R14 - JGE varTable - SUBQ $1, CX - SHLQ $1, DX - JMP calcShift - -varTable: - // var table [maxTableSize]uint16 - // - // In the asm code, unlike the Go code, we can zero-initialize only the - // first tableSize elements. Each uint16 element is 2 bytes and each MOVOU - // writes 16 bytes, so we can do only tableSize/8 writes instead of the - // 2048 writes that would zero-initialize all of table's 32768 bytes. - SHRQ $3, DX - LEAQ table-32768(SP), BX - PXOR X0, X0 - -memclr: - MOVOU X0, 0(BX) - ADDQ $16, BX - SUBQ $1, DX - JNZ memclr - - // !!! DX = &src[0] - MOVQ SI, DX - - // sLimit := len(src) - inputMargin - MOVQ R14, R9 - SUBQ $15, R9 - - // !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't - // change for the rest of the function. 
- MOVQ CX, 56(SP) - MOVQ DX, 64(SP) - MOVQ R9, 88(SP) - - // nextEmit := 0 - MOVQ DX, R10 - - // s := 1 - ADDQ $1, SI - - // nextHash := hash(load32(src, s), shift) - MOVL 0(SI), R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - -outer: - // for { etc } - - // skip := 32 - MOVQ $32, R12 - - // nextS := s - MOVQ SI, R13 - - // candidate := 0 - MOVQ $0, R15 - -inner0: - // for { etc } - - // s := nextS - MOVQ R13, SI - - // bytesBetweenHashLookups := skip >> 5 - MOVQ R12, R14 - SHRQ $5, R14 - - // nextS = s + bytesBetweenHashLookups - ADDQ R14, R13 - - // skip += bytesBetweenHashLookups - ADDQ R14, R12 - - // if nextS > sLimit { goto emitRemainder } - MOVQ R13, AX - SUBQ DX, AX - CMPQ AX, R9 - JA emitRemainder - - // candidate = int(table[nextHash]) - // XXX: MOVWQZX table-32768(SP)(R11*2), R15 - // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 - BYTE $0x4e - BYTE $0x0f - BYTE $0xb7 - BYTE $0x7c - BYTE $0x5c - BYTE $0x78 - - // table[nextHash] = uint16(s) - MOVQ SI, AX - SUBQ DX, AX - - // XXX: MOVW AX, table-32768(SP)(R11*2) - // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) - BYTE $0x66 - BYTE $0x42 - BYTE $0x89 - BYTE $0x44 - BYTE $0x5c - BYTE $0x78 - - // nextHash = hash(load32(src, nextS), shift) - MOVL 0(R13), R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // if load32(src, s) != load32(src, candidate) { continue } break - MOVL 0(SI), AX - MOVL (DX)(R15*1), BX - CMPL AX, BX - JNE inner0 - -fourByteMatch: - // As per the encode_other.go code: - // - // A 4-byte match has been found. We'll later see etc. - - // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment - // on inputMargin in encode.go. - MOVQ SI, AX - SUBQ R10, AX - CMPQ AX, $16 - JLE emitLiteralFastPath - - // ---------------------------------------- - // Begin inline of the emitLiteral call. - // - // d += emitLiteral(dst[d:], src[nextEmit:s]) - - MOVL AX, BX - SUBL $1, BX - - CMPL BX, $60 - JLT inlineEmitLiteralOneByte - CMPL BX, $256 - JLT inlineEmitLiteralTwoBytes - -inlineEmitLiteralThreeBytes: - MOVB $0xf4, 0(DI) - MOVW BX, 1(DI) - ADDQ $3, DI - JMP inlineEmitLiteralMemmove - -inlineEmitLiteralTwoBytes: - MOVB $0xf0, 0(DI) - MOVB BX, 1(DI) - ADDQ $2, DI - JMP inlineEmitLiteralMemmove - -inlineEmitLiteralOneByte: - SHLB $2, BX - MOVB BX, 0(DI) - ADDQ $1, DI - -inlineEmitLiteralMemmove: - // Spill local variables (registers) onto the stack; call; unspill. - // - // copy(dst[i:], lit) - // - // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push - // DI, R10 and AX as arguments. - MOVQ DI, 0(SP) - MOVQ R10, 8(SP) - MOVQ AX, 16(SP) - ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)". - MOVQ SI, 72(SP) - MOVQ DI, 80(SP) - MOVQ R15, 112(SP) - CALL runtime·memmove(SB) - MOVQ 56(SP), CX - MOVQ 64(SP), DX - MOVQ 72(SP), SI - MOVQ 80(SP), DI - MOVQ 88(SP), R9 - MOVQ 112(SP), R15 - JMP inner1 - -inlineEmitLiteralEnd: - // End inline of the emitLiteral call. - // ---------------------------------------- - -emitLiteralFastPath: - // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2". - MOVB AX, BX - SUBB $1, BX - SHLB $2, BX - MOVB BX, (DI) - ADDQ $1, DI - - // !!! Implement the copy from lit to dst as a 16-byte load and store. - // (Encode's documentation says that dst and src must not overlap.) - // - // This always copies 16 bytes, instead of only len(lit) bytes, but that's - // OK. Subsequent iterations will fix up the overrun. - // - // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or - // 16-byte loads and stores. 
This technique probably wouldn't be as - // effective on architectures that are fussier about alignment. - MOVOU 0(R10), X0 - MOVOU X0, 0(DI) - ADDQ AX, DI - -inner1: - // for { etc } - - // base := s - MOVQ SI, R12 - - // !!! offset := base - candidate - MOVQ R12, R11 - SUBQ R15, R11 - SUBQ DX, R11 - - // ---------------------------------------- - // Begin inline of the extendMatch call. - // - // s = extendMatch(src, candidate+4, s+4) - - // !!! R14 = &src[len(src)] - MOVQ src_len+32(FP), R14 - ADDQ DX, R14 - - // !!! R13 = &src[len(src) - 8] - MOVQ R14, R13 - SUBQ $8, R13 - - // !!! R15 = &src[candidate + 4] - ADDQ $4, R15 - ADDQ DX, R15 - - // !!! s += 4 - ADDQ $4, SI - -inlineExtendMatchCmp8: - // As long as we are 8 or more bytes before the end of src, we can load and - // compare 8 bytes at a time. If those 8 bytes are equal, repeat. - CMPQ SI, R13 - JA inlineExtendMatchCmp1 - MOVQ (R15), AX - MOVQ (SI), BX - CMPQ AX, BX - JNE inlineExtendMatchBSF - ADDQ $8, R15 - ADDQ $8, SI - JMP inlineExtendMatchCmp8 - -inlineExtendMatchBSF: - // If those 8 bytes were not equal, XOR the two 8 byte values, and return - // the index of the first byte that differs. The BSF instruction finds the - // least significant 1 bit, the amd64 architecture is little-endian, and - // the shift by 3 converts a bit index to a byte index. - XORQ AX, BX - BSFQ BX, BX - SHRQ $3, BX - ADDQ BX, SI - JMP inlineExtendMatchEnd - -inlineExtendMatchCmp1: - // In src's tail, compare 1 byte at a time. - CMPQ SI, R14 - JAE inlineExtendMatchEnd - MOVB (R15), AX - MOVB (SI), BX - CMPB AX, BX - JNE inlineExtendMatchEnd - ADDQ $1, R15 - ADDQ $1, SI - JMP inlineExtendMatchCmp1 - -inlineExtendMatchEnd: - // End inline of the extendMatch call. - // ---------------------------------------- - - // ---------------------------------------- - // Begin inline of the emitCopy call. - // - // d += emitCopy(dst[d:], base-candidate, s-base) - - // !!! length := s - base - MOVQ SI, AX - SUBQ R12, AX - -inlineEmitCopyLoop0: - // for length >= 68 { etc } - CMPL AX, $68 - JLT inlineEmitCopyStep1 - - // Emit a length 64 copy, encoded as 3 bytes. - MOVB $0xfe, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $64, AX - JMP inlineEmitCopyLoop0 - -inlineEmitCopyStep1: - // if length > 64 { etc } - CMPL AX, $64 - JLE inlineEmitCopyStep2 - - // Emit a length 60 copy, encoded as 3 bytes. - MOVB $0xee, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - SUBL $60, AX - -inlineEmitCopyStep2: - // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 } - CMPL AX, $12 - JGE inlineEmitCopyStep3 - CMPL R11, $2048 - JGE inlineEmitCopyStep3 - - // Emit the remaining copy, encoded as 2 bytes. - MOVB R11, 1(DI) - SHRL $8, R11 - SHLB $5, R11 - SUBB $4, AX - SHLB $2, AX - ORB AX, R11 - ORB $1, R11 - MOVB R11, 0(DI) - ADDQ $2, DI - JMP inlineEmitCopyEnd - -inlineEmitCopyStep3: - // Emit the remaining copy, encoded as 3 bytes. - SUBL $1, AX - SHLB $2, AX - ORB $2, AX - MOVB AX, 0(DI) - MOVW R11, 1(DI) - ADDQ $3, DI - -inlineEmitCopyEnd: - // End inline of the emitCopy call. - // ---------------------------------------- - - // nextEmit = s - MOVQ SI, R10 - - // if s >= sLimit { goto emitRemainder } - MOVQ SI, AX - SUBQ DX, AX - CMPQ AX, R9 - JAE emitRemainder - - // As per the encode_other.go code: - // - // We could immediately etc. 
- - // x := load64(src, s-1) - MOVQ -1(SI), R14 - - // prevHash := hash(uint32(x>>0), shift) - MOVL R14, R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // table[prevHash] = uint16(s-1) - MOVQ SI, AX - SUBQ DX, AX - SUBQ $1, AX - - // XXX: MOVW AX, table-32768(SP)(R11*2) - // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) - BYTE $0x66 - BYTE $0x42 - BYTE $0x89 - BYTE $0x44 - BYTE $0x5c - BYTE $0x78 - - // currHash := hash(uint32(x>>8), shift) - SHRQ $8, R14 - MOVL R14, R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // candidate = int(table[currHash]) - // XXX: MOVWQZX table-32768(SP)(R11*2), R15 - // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 - BYTE $0x4e - BYTE $0x0f - BYTE $0xb7 - BYTE $0x7c - BYTE $0x5c - BYTE $0x78 - - // table[currHash] = uint16(s) - ADDQ $1, AX - - // XXX: MOVW AX, table-32768(SP)(R11*2) - // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) - BYTE $0x66 - BYTE $0x42 - BYTE $0x89 - BYTE $0x44 - BYTE $0x5c - BYTE $0x78 - - // if uint32(x>>8) == load32(src, candidate) { continue } - MOVL (DX)(R15*1), BX - CMPL R14, BX - JEQ inner1 - - // nextHash = hash(uint32(x>>16), shift) - SHRQ $8, R14 - MOVL R14, R11 - IMULL $0x1e35a7bd, R11 - SHRL CX, R11 - - // s++ - ADDQ $1, SI - - // break out of the inner1 for loop, i.e. continue the outer loop. - JMP outer - -emitRemainder: - // if nextEmit < len(src) { etc } - MOVQ src_len+32(FP), AX - ADDQ DX, AX - CMPQ R10, AX - JEQ encodeBlockEnd - - // d += emitLiteral(dst[d:], src[nextEmit:]) - // - // Push args. - MOVQ DI, 0(SP) - MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative. - MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative. - MOVQ R10, 24(SP) - SUBQ R10, AX - MOVQ AX, 32(SP) - MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative. - - // Spill local variables (registers) onto the stack; call; unspill. - MOVQ DI, 80(SP) - CALL ·emitLiteral(SB) - MOVQ 80(SP), DI - - // Finish the "d +=" part of "d += emitLiteral(etc)". - ADDQ 48(SP), DI - -encodeBlockEnd: - MOVQ dst_base+0(FP), AX - SUBQ AX, DI - MOVQ DI, d+48(FP) - RET diff --git a/vendor/github.com/klauspost/compress/snappy/encode_other.go b/vendor/github.com/klauspost/compress/snappy/encode_other.go deleted file mode 100644 index dbcae905e6..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/encode_other.go +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright 2016 The Snappy-Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !amd64 appengine !gc noasm - -package snappy - -func load32(b []byte, i int) uint32 { - b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load64(b []byte, i int) uint64 { - b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 -} - -// emitLiteral writes a literal chunk and returns the number of bytes written. 
-// -// It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= len(lit) && len(lit) <= 65536 -func emitLiteral(dst, lit []byte) int { - i, n := 0, uint(len(lit)-1) - switch { - case n < 60: - dst[0] = uint8(n)<<2 | tagLiteral - i = 1 - case n < 1<<8: - dst[0] = 60<<2 | tagLiteral - dst[1] = uint8(n) - i = 2 - default: - dst[0] = 61<<2 | tagLiteral - dst[1] = uint8(n) - dst[2] = uint8(n >> 8) - i = 3 - } - return i + copy(dst[i:], lit) -} - -// emitCopy writes a copy chunk and returns the number of bytes written. -// -// It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= 65535 -// 4 <= length && length <= 65535 -func emitCopy(dst []byte, offset, length int) int { - i := 0 - // The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The - // threshold for this loop is a little higher (at 68 = 64 + 4), and the - // length emitted down below is a little lower (at 60 = 64 - 4), because - // it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed - // by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as - // a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as - // 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a - // tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an - // encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1. - for length >= 68 { - // Emit a length 64 copy, encoded as 3 bytes. - dst[i+0] = 63<<2 | tagCopy2 - dst[i+1] = uint8(offset) - dst[i+2] = uint8(offset >> 8) - i += 3 - length -= 64 - } - if length > 64 { - // Emit a length 60 copy, encoded as 3 bytes. - dst[i+0] = 59<<2 | tagCopy2 - dst[i+1] = uint8(offset) - dst[i+2] = uint8(offset >> 8) - i += 3 - length -= 60 - } - if length >= 12 || offset >= 2048 { - // Emit the remaining copy, encoded as 3 bytes. - dst[i+0] = uint8(length-1)<<2 | tagCopy2 - dst[i+1] = uint8(offset) - dst[i+2] = uint8(offset >> 8) - return i + 3 - } - // Emit the remaining copy, encoded as 2 bytes. - dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 - dst[i+1] = uint8(offset) - return i + 2 -} - -// extendMatch returns the largest k such that k <= len(src) and that -// src[i:i+k-j] and src[j:k] have the same contents. -// -// It assumes that: -// 0 <= i && i < j && j <= len(src) -func extendMatch(src []byte, i, j int) int { - for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 { - } - return j -} - -func hash(u, shift uint32) uint32 { - return (u * 0x1e35a7bd) >> shift -} - -// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It -// assumes that the varint-encoded length of the decompressed bytes has already -// been written. -// -// It also assumes that: -// len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlock(dst, src []byte) (d int) { - // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. - // The table element type is uint16, as s < sLimit and sLimit < len(src) - // and len(src) <= maxBlockSize and maxBlockSize == 65536. - const ( - maxTableSize = 1 << 14 - // tableMask is redundant, but helps the compiler eliminate bounds - // checks. - tableMask = maxTableSize - 1 - ) - shift := uint32(32 - 8) - for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { - shift-- - } - // In Go, all array elements are zero-initialized, so there is no advantage - // to a smaller tableSize per se.
However, it matches the C++ algorithm, - // and in the asm versions of this code, we can get away with zeroing only - // the first tableSize elements. - var table [maxTableSize]uint16 - - // sLimit is when to stop looking for offset/length copies. The inputMargin - // lets us use a fast path for emitLiteral in the main loop, while we are - // looking for copies. - sLimit := len(src) - inputMargin - - // nextEmit is where in src the next emitLiteral should start from. - nextEmit := 0 - - // The encoded form must start with a literal, as there are no previous - // bytes to copy, so we start looking for hash matches at s == 1. - s := 1 - nextHash := hash(load32(src, s), shift) - - for { - // Copied from the C++ snappy implementation: - // - // Heuristic match skipping: If 32 bytes are scanned with no matches - // found, start looking only at every other byte. If 32 more bytes are - // scanned (or skipped), look at every third byte, etc.. When a match - // is found, immediately go back to looking at every byte. This is a - // small loss (~5% performance, ~0.1% density) for compressible data - // due to more bookkeeping, but for non-compressible data (such as - // JPEG) it's a huge win since the compressor quickly "realizes" the - // data is incompressible and doesn't bother looking for matches - // everywhere. - // - // The "skip" variable keeps track of how many bytes there are since - // the last match; dividing it by 32 (ie. right-shifting by five) gives - // the number of bytes to move ahead for each iteration. - skip := 32 - - nextS := s - candidate := 0 - for { - s = nextS - bytesBetweenHashLookups := skip >> 5 - nextS = s + bytesBetweenHashLookups - skip += bytesBetweenHashLookups - if nextS > sLimit { - goto emitRemainder - } - candidate = int(table[nextHash&tableMask]) - table[nextHash&tableMask] = uint16(s) - nextHash = hash(load32(src, nextS), shift) - if load32(src, s) == load32(src, candidate) { - break - } - } - - // A 4-byte match has been found. We'll later see if more than 4 bytes - // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit - // them as literal bytes. - d += emitLiteral(dst[d:], src[nextEmit:s]) - - // Call emitCopy, and then see if another emitCopy could be our next - // move. Repeat until we find no match for the input immediately after - // what was consumed by the last emitCopy call. - // - // If we exit this loop normally then we need to call emitLiteral next, - // though we don't yet know how big the literal will be. We handle that - // by proceeding to the next iteration of the main loop. We also can - // exit this loop via goto if we get close to exhausting the input. - for { - // Invariant: we have a 4-byte match at s, and no need to emit any - // literal bytes prior to s. - base := s - - // Extend the 4-byte match as long as possible. - // - // This is an inlined version of: - // s = extendMatch(src, candidate+4, s+4) - s += 4 - for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 { - } - - d += emitCopy(dst[d:], base-candidate, s-base) - nextEmit = s - if s >= sLimit { - goto emitRemainder - } - - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-1 and at s. If - // another emitCopy is not our next move, also calculate nextHash - // at s+1. At least on GOARCH=amd64, these three hash calculations - // are faster as one load64 call (with some shifts) instead of - // three load32 calls. 
- x := load64(src, s-1) - prevHash := hash(uint32(x>>0), shift) - table[prevHash&tableMask] = uint16(s - 1) - currHash := hash(uint32(x>>8), shift) - candidate = int(table[currHash&tableMask]) - table[currHash&tableMask] = uint16(s) - if uint32(x>>8) != load32(src, candidate) { - nextHash = hash(uint32(x>>16), shift) - s++ - break - } - } - } - -emitRemainder: - if nextEmit < len(src) { - d += emitLiteral(dst[d:], src[nextEmit:]) - } - return d -} diff --git a/vendor/github.com/klauspost/compress/snappy/runbench.cmd b/vendor/github.com/klauspost/compress/snappy/runbench.cmd deleted file mode 100644 index d24eb4b47c..0000000000 --- a/vendor/github.com/klauspost/compress/snappy/runbench.cmd +++ /dev/null @@ -1,2 +0,0 @@ -del old.txt -go test -bench=. >>old.txt && go test -bench=. >>old.txt && go test -bench=. >>old.txt && benchstat -delta-test=ttest old.txt new.txt diff --git a/vendor/github.com/klauspost/compress/zip/reader.go b/vendor/github.com/klauspost/compress/zip/reader.go index 9fab6e3878..c3bcc88383 100644 --- a/vendor/github.com/klauspost/compress/zip/reader.go +++ b/vendor/github.com/klauspost/compress/zip/reader.go @@ -8,45 +8,71 @@ import ( "bufio" "encoding/binary" "errors" - "fmt" "hash" "hash/crc32" "io" + "io/fs" "os" + "path" + "path/filepath" + "sort" + "strings" + "sync" "time" + + "github.com/klauspost/compress/internal/godebug" ) var ( - ErrFormat = errors.New("zip: not a valid zip file") - ErrAlgorithm = errors.New("zip: unsupported compression algorithm") - ErrChecksum = errors.New("zip: checksum error") + ErrFormat = errors.New("zip: not a valid zip file") + ErrAlgorithm = errors.New("zip: unsupported compression algorithm") + ErrChecksum = errors.New("zip: checksum error") + ErrInsecurePath = errors.New("zip: insecure file path") ) +// A Reader serves content from a ZIP archive. type Reader struct { r io.ReaderAt File []*File Comment string decompressors map[uint16]Decompressor + + // Some JAR files are zip files with a prefix that is a bash script. + // The baseOffset field is the start of the zip file proper. + baseOffset int64 + + // fileList is a list of files sorted by ename, + // for use by the Open method. + fileListOnce sync.Once + fileList []fileListEntry } +// A ReadCloser is a [Reader] that must be closed when no longer needed. type ReadCloser struct { f *os.File Reader } +// A File is a single file in a ZIP archive. +// The file information is in the embedded [FileHeader]. +// The file content can be accessed by calling [File.Open]. type File struct { FileHeader zip *Reader zipr io.ReaderAt - zipsize int64 - headerOffset int64 -} - -func (f *File) hasDataDescriptor() bool { - return f.Flags&0x8 != 0 + headerOffset int64 // includes overall ZIP archive baseOffset + zip64 bool // zip64 extended information extra field presence } // OpenReader will open the Zip file specified by name and return a ReadCloser. +// +// If any file inside the archive uses a non-local name +// (as defined by [filepath.IsLocal]) or a name containing backslashes +// and the GODEBUG environment variable contains `zipinsecurepath=0`, +// OpenReader returns the reader with an ErrInsecurePath error. +// A future version of Go may introduce this behavior by default. +// Programs that want to accept non-local names can ignore +// the ErrInsecurePath error and use the returned reader. 
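Editor's note: since the ErrInsecurePath behavior described just above is new in this update, here is a hedged caller-side sketch (assumed caller code, not part of this diff; "archive.zip" is a placeholder path) of a program that opts in to non-local names, as the comment suggests:

package main

import (
	"errors"
	"log"

	"github.com/klauspost/compress/zip"
)

func main() {
	// When err is ErrInsecurePath, the returned reader is still usable;
	// any other error is fatal.
	rc, err := zip.OpenReader("archive.zip")
	if err != nil && !errors.Is(err, zip.ErrInsecurePath) {
		log.Fatal(err)
	}
	defer rc.Close()
	for _, f := range rc.File {
		log.Println(f.Name)
	}
}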
func OpenReader(name string) (*ReadCloser, error) { f, err := os.Open(name) if err != nil { @@ -58,50 +84,68 @@ func OpenReader(name string) (*ReadCloser, error) { return nil, err } r := new(ReadCloser) - if err := r.init(f, fi.Size()); err != nil { + if err = r.init(f, fi.Size()); err != nil && err != ErrInsecurePath { f.Close() return nil, err } r.f = f - return r, nil + return r, err } -// NewReader returns a new Reader reading from r, which is assumed to +// NewReader returns a new [Reader] reading from r, which is assumed to // have the given size in bytes. +// +// If any file inside the archive uses a non-local name +// (as defined by [filepath.IsLocal]) or a name containing backslashes +// and the GODEBUG environment variable contains `zipinsecurepath=0`, +// NewReader returns the reader with an [ErrInsecurePath] error. +// A future version of Go may introduce this behavior by default. +// Programs that want to accept non-local names can ignore +// the [ErrInsecurePath] error and use the returned reader. func NewReader(r io.ReaderAt, size int64) (*Reader, error) { if size < 0 { return nil, errors.New("zip: size cannot be negative") } zr := new(Reader) - if err := zr.init(r, size); err != nil { + var err error + if err = zr.init(r, size); err != nil && err != ErrInsecurePath { return nil, err } - return zr, nil + return zr, err } -func (z *Reader) init(r io.ReaderAt, size int64) error { - end, err := readDirectoryEnd(r, size) +func (r *Reader) init(rdr io.ReaderAt, size int64) error { + end, baseOffset, err := readDirectoryEnd(rdr, size) if err != nil { return err } - if end.directoryRecords > uint64(size)/fileHeaderLen { - return fmt.Errorf("archive/zip: TOC declares impossible %d files in %d byte zip", end.directoryRecords, size) + r.r = rdr + r.baseOffset = baseOffset + // Since the number of directory records is not validated, it is not + // safe to preallocate r.File without first checking that the specified + // number of files is reasonable, since a malformed archive may + // indicate it contains up to 1 << 128 - 1 files. Since each file has a + // header which will be _at least_ 30 bytes we can safely preallocate + // if (data size / 30) >= end.directoryRecords. + if end.directorySize < uint64(size) && (uint64(size)-end.directorySize)/30 >= end.directoryRecords { + r.File = make([]*File, 0, end.directoryRecords) } - z.r = r - z.File = make([]*File, 0, end.directoryRecords) - z.Comment = end.comment - rs := io.NewSectionReader(r, 0, size) - if _, err = rs.Seek(int64(end.directoryOffset), io.SeekStart); err != nil { + r.Comment = end.comment + rs := io.NewSectionReader(rdr, 0, size) + if _, err = rs.Seek(r.baseOffset+int64(end.directoryOffset), io.SeekStart); err != nil { return err } buf := bufio.NewReader(rs) + // Get once + zipinsecurepath := godebug.Get("zipinsecurepath") == "0" + // The count of files inside a zip is truncated to fit in a uint16. // Gloss over this by reading headers until we encounter // a bad one, and then only report an ErrFormat or UnexpectedEOF if // the file count modulo 65536 is incorrect. 
for { - f := &File{zip: z, zipr: r, zipsize: size} + f := &File{zip: r, zipr: rdr} err = readDirectoryHeader(f, buf) if err == ErrFormat || err == io.ErrUnexpectedEOF { break @@ -109,28 +153,42 @@ func (z *Reader) init(r io.ReaderAt, size int64) error { if err != nil { return err } - z.File = append(z.File, f) + f.headerOffset += r.baseOffset + r.File = append(r.File, f) } - if uint16(len(z.File)) != uint16(end.directoryRecords) { // only compare 16 bits here + if uint16(len(r.File)) != uint16(end.directoryRecords) { // only compare 16 bits here // Return the readDirectoryHeader error if we read // the wrong number of directory entries. return err } + if zipinsecurepath { + for _, f := range r.File { + if f.Name == "" { + // Zip permits an empty file name field. + continue + } + // The zip specification states that names must use forward slashes, + // so consider any backslashes in the name insecure. + if !filepath.IsLocal(f.Name) || strings.Contains(f.Name, `\`) { + return ErrInsecurePath + } + } + } return nil } // RegisterDecompressor registers or overrides a custom decompressor for a // specific method ID. If a decompressor for a given method is not found, -// Reader will default to looking up the decompressor at the package level. -func (z *Reader) RegisterDecompressor(method uint16, dcomp Decompressor) { - if z.decompressors == nil { - z.decompressors = make(map[uint16]Decompressor) +// [Reader] will default to looking up the decompressor at the package level. +func (r *Reader) RegisterDecompressor(method uint16, dcomp Decompressor) { + if r.decompressors == nil { + r.decompressors = make(map[uint16]Decompressor) } - z.decompressors[method] = dcomp + r.decompressors[method] = dcomp } -func (z *Reader) decompressor(method uint16) Decompressor { - dcomp := z.decompressors[method] +func (r *Reader) decompressor(method uint16) Decompressor { + dcomp := r.decompressors[method] if dcomp == nil { dcomp = decompressor(method) } @@ -145,7 +203,7 @@ func (rc *ReadCloser) Close() error { // DataOffset returns the offset of the file's possibly-compressed // data, relative to the beginning of the zip file. // -// Most callers should instead use Open, which transparently +// Most callers should instead use [File.Open], which transparently // decompresses data and verifies checksums. func (f *File) DataOffset() (offset int64, err error) { bodyOffset, err := f.findBodyOffset() @@ -155,13 +213,29 @@ func (f *File) DataOffset() (offset int64, err error) { return f.headerOffset + bodyOffset, nil } -// Open returns a ReadCloser that provides access to the File's contents. +// Open returns a [ReadCloser] that provides access to the [File]'s contents. // Multiple files may be read concurrently. func (f *File) Open() (io.ReadCloser, error) { bodyOffset, err := f.findBodyOffset() if err != nil { return nil, err } + if strings.HasSuffix(f.Name, "/") { + // The ZIP specification (APPNOTE.TXT) specifies that directories, which + // are technically zero-byte files, must not have any associated file + // data. We previously tried failing here if f.CompressedSize64 != 0, + // but it turns out that a number of implementations (namely, the Java + // jar tool) don't properly set the storage method on directories + // resulting in a file with compressed size > 0 but uncompressed size == + // 0. We still want to fail when a directory has associated uncompressed + // data, but we are tolerant of cases where the uncompressed size is + // zero but compressed size is not. 
+ if f.UncompressedSize64 != 0 { + return &dirReader{ErrFormat}, nil + } else { + return &dirReader{io.EOF}, nil + } + } size := int64(f.CompressedSize64) r := io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, size) dcomp := f.zip.decompressor(f.Method) @@ -182,14 +256,27 @@ func (f *File) Open() (io.ReadCloser, error) { return rc, nil } -// OpenRaw returns a Reader that returns the *compressed* output of the file. +// OpenRaw returns a [Reader] that provides access to the [File]'s contents without +// decompression. func (f *File) OpenRaw() (io.Reader, error) { bodyOffset, err := f.findBodyOffset() if err != nil { return nil, err } - size := int64(f.CompressedSize64) - return io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, size), nil + r := io.NewSectionReader(f.zipr, f.headerOffset+bodyOffset, int64(f.CompressedSize64)) + return r, nil +} + +type dirReader struct { + err error +} + +func (r *dirReader) Read([]byte) (int, error) { + return 0, r.err +} + +func (r *dirReader) Close() error { + return nil } type checksumReader struct { @@ -201,6 +288,10 @@ type checksumReader struct { err error // sticky error } +func (r *checksumReader) Stat() (fs.FileInfo, error) { + return headerFileInfo{&r.f.FileHeader}, nil +} + func (r *checksumReader) Read(b []byte) (n int, err error) { if r.err != nil { return 0, r.err @@ -208,6 +299,9 @@ func (r *checksumReader) Read(b []byte) (n int, err error) { n, err = r.rc.Read(b) r.hash.Write(b[:n]) r.nread += uint64(n) + if r.nread > r.f.UncompressedSize64 { + return 0, ErrFormat + } if err == nil { return } @@ -331,6 +425,8 @@ parseExtras: switch fieldTag { case zip64ExtraID: + f.zip64 = true + // update directory values from the zip64 extra block. // They should only be consulted if the sizes read earlier // are maxed out. @@ -374,8 +470,8 @@ parseExtras: const ticksPerSecond = 1e7 // Windows timestamp resolution ts := int64(attrBuf.uint64()) // ModTime since Windows epoch - secs := int64(ts / ticksPerSecond) - nsecs := (1e9 / ticksPerSecond) * int64(ts%ticksPerSecond) + secs := ts / ticksPerSecond + nsecs := (1e9 / ticksPerSecond) * (ts % ticksPerSecond) epoch := time.Date(1601, time.January, 1, 0, 0, 0, 0, time.UTC) modified = time.Unix(epoch.Unix()+secs, nsecs) } @@ -432,7 +528,6 @@ parseExtras: func readDataDescriptor(r io.Reader, f *File) error { var buf [dataDescriptorLen]byte - // The spec says: "Although not originally assigned a // signature, the value 0x08074b50 has commonly been adopted // as a signature value for the data descriptor record. 
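Editor's note: the NTFS extra-field branch above converts a count of 100ns ticks since the Windows epoch (1601-01-01 UTC) into a Go time.Time. A self-contained sketch of the same arithmetic (the tick value is an arbitrary example):

package main

import (
	"fmt"
	"time"
)

func main() {
	const ticksPerSecond = 1e7 // Windows timestamp resolution
	ts := int64(133_000_000_000_000_000) // example tick count
	secs := ts / ticksPerSecond
	nsecs := (1e9 / ticksPerSecond) * (ts % ticksPerSecond)
	epoch := time.Date(1601, time.January, 1, 0, 0, 0, 0, time.UTC)
	// Prints a time in mid-2022 (Unix seconds ~1655526400).
	fmt.Println(time.Unix(epoch.Unix()+secs, nsecs).UTC())
}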
@@ -470,7 +565,7 @@ func readDataDescriptor(r io.Reader, f *File) error { return nil } -func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, err error) { +func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, baseOffset int64, err error) { // look for directoryEndSignature in the last 1k, then in the last 65k var buf []byte var directoryEndOffset int64 @@ -480,7 +575,7 @@ func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, err error) } buf = make([]byte, int(bLen)) if _, err := r.ReadAt(buf, size-bLen); err != nil && err != io.EOF { - return nil, err + return nil, 0, err } if p := findSignatureInBlock(buf); p >= 0 { buf = buf[p:] @@ -488,7 +583,7 @@ func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, err error) break } if i == 1 || bLen == size { - return nil, ErrFormat + return nil, 0, ErrFormat } } @@ -505,7 +600,7 @@ func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, err error) } l := int(d.commentLen) if l > len(b) { - return nil, errors.New("zip: invalid comment length") + return nil, 0, errors.New("zip: invalid comment length") } d.comment = string(b[:l]) @@ -513,17 +608,40 @@ func readDirectoryEnd(r io.ReaderAt, size int64) (dir *directoryEnd, err error) if d.directoryRecords == 0xffff || d.directorySize == 0xffff || d.directoryOffset == 0xffffffff { p, err := findDirectory64End(r, directoryEndOffset) if err == nil && p >= 0 { + directoryEndOffset = p err = readDirectory64End(r, p, d) } if err != nil { - return nil, err + return nil, 0, err } } + + maxInt64 := uint64(1<<63 - 1) + if d.directorySize > maxInt64 || d.directoryOffset > maxInt64 { + return nil, 0, ErrFormat + } + + baseOffset = directoryEndOffset - int64(d.directorySize) - int64(d.directoryOffset) + // Make sure directoryOffset points to somewhere in our file. - if o := int64(d.directoryOffset); o < 0 || o >= size { - return nil, ErrFormat + if o := baseOffset + int64(d.directoryOffset); o < 0 || o >= size { + return nil, 0, ErrFormat + } + + // If the directory end data tells us to use a non-zero baseOffset, + // but we would find a valid directory entry if we assume that the + // baseOffset is 0, then just use a baseOffset of 0. + // We've seen files in which the directory end data gives us + // an incorrect baseOffset. + if baseOffset > 0 { + off := int64(d.directoryOffset) + rs := io.NewSectionReader(r, off, size-off) + if readDirectoryHeader(&File{}, rs) == nil { + baseOffset = 0 + } } - return d, nil + + return d, baseOffset, nil } // findDirectory64End tries to read the zip64 locator just before the @@ -582,9 +700,13 @@ func findSignatureInBlock(b []byte) int { if b[i] == 'P' && b[i+1] == 'K' && b[i+2] == 0x05 && b[i+3] == 0x06 { // n is length of comment n := int(b[i+directoryEndLen-2]) | int(b[i+directoryEndLen-1])<<8 - if n+directoryEndLen+i <= len(b) { - return i + if n+directoryEndLen+i > len(b) { + // Truncated comment. + // Some parsers (such as Info-ZIP) ignore the truncated comment + // rather than treating it as a hard error. + return -1 } + return i } } return -1 @@ -621,3 +743,313 @@ func (b *readBuf) sub(n int) readBuf { *b = (*b)[n:] return b2 } + +// A fileListEntry is a File and its ename. +// If file == nil, the fileListEntry describes a directory without metadata. 
+type fileListEntry struct { + name string + file *File + isDir bool + isDup bool +} + +type fileInfoDirEntry interface { + fs.FileInfo + fs.DirEntry +} + +func (f *fileListEntry) stat() (fileInfoDirEntry, error) { + if f.isDup { + return nil, errors.New(f.name + ": duplicate entries in zip file") + } + if !f.isDir { + return headerFileInfo{&f.file.FileHeader}, nil + } + return f, nil +} + +// Only used for directories. +func (f *fileListEntry) Name() string { _, elem, _ := split(f.name); return elem } +func (f *fileListEntry) Size() int64 { return 0 } +func (f *fileListEntry) Mode() fs.FileMode { return fs.ModeDir | 0555 } +func (f *fileListEntry) Type() fs.FileMode { return fs.ModeDir } +func (f *fileListEntry) IsDir() bool { return true } +func (f *fileListEntry) Sys() any { return nil } + +func (f *fileListEntry) ModTime() time.Time { + if f.file == nil { + return time.Time{} + } + return f.file.FileHeader.Modified.UTC() +} + +func (f *fileListEntry) Info() (fs.FileInfo, error) { return f, nil } + +func (f *fileListEntry) String() string { + return formatDirEntry(f) +} + +// formatDirEntry returns a formatted version of dir for human readability. +// Implementations of [DirEntry] can call this from a String method. +// The outputs for a directory named subdir and a file named hello.go are: +// +// d subdir/ +// - hello.go +// +// TODO: Use fs.FormatDirEntry when Go 1.20 is no longer supported +func formatDirEntry(dir fs.DirEntry) string { + name := dir.Name() + b := make([]byte, 0, 5+len(name)) + + // The Type method does not return any permission bits, + // so strip them from the string. + mode := dir.Type().String() + mode = mode[:len(mode)-9] + + b = append(b, mode...) + b = append(b, ' ') + b = append(b, name...) + if dir.IsDir() { + b = append(b, '/') + } + return string(b) +} + +// formatFileInfo returns a formatted version of info for human readability. +// Implementations of [FileInfo] can call this from a String method. +// The output for a file named "hello.go", 100 bytes, mode 0o644, created +// January 1, 1970 at noon is +// +// -rw-r--r-- 100 1970-01-01 12:00:00 hello.go +// +// TODO: Use fs.FormatFileInfo when Go 1.20 is no longer supported +func formatFileInfo(info fs.FileInfo) string { + name := info.Name() + b := make([]byte, 0, 40+len(name)) + b = append(b, info.Mode().String()...) + b = append(b, ' ') + + size := info.Size() + var usize uint64 + if size >= 0 { + usize = uint64(size) + } else { + b = append(b, '-') + usize = uint64(-size) + } + var buf [20]byte + i := len(buf) - 1 + for usize >= 10 { + q := usize / 10 + buf[i] = byte('0' + usize - q*10) + i-- + usize = q + } + buf[i] = byte('0' + usize) + b = append(b, buf[i:]...) + b = append(b, ' ') + + b = append(b, info.ModTime().Format(time.DateTime)...) + b = append(b, ' ') + + b = append(b, name...) + if info.IsDir() { + b = append(b, '/') + } + + return string(b) +} + +// toValidName coerces name to be a valid name for fs.FS.Open. +func toValidName(name string) string { + name = strings.ReplaceAll(name, `\`, `/`) + p := path.Clean(name) + + p = strings.TrimPrefix(p, "/") + + for strings.HasPrefix(p, "../") { + p = p[len("../"):] + } + + return p +} + +func (r *Reader) initFileList() { + r.fileListOnce.Do(func() { + // files and knownDirs map from a file/directory name + // to an index into the r.fileList entry that we are + // building. They are used to mark duplicate entries. 
+ files := make(map[string]int) + knownDirs := make(map[string]int) + + // dirs[name] is true if name is known to be a directory, + // because it appears as a prefix in a path. + dirs := make(map[string]bool) + + for _, file := range r.File { + isDir := len(file.Name) > 0 && file.Name[len(file.Name)-1] == '/' + name := toValidName(file.Name) + if name == "" { + continue + } + + if idx, ok := files[name]; ok { + r.fileList[idx].isDup = true + continue + } + if idx, ok := knownDirs[name]; ok { + r.fileList[idx].isDup = true + continue + } + + for dir := path.Dir(name); dir != "."; dir = path.Dir(dir) { + dirs[dir] = true + } + + idx := len(r.fileList) + entry := fileListEntry{ + name: name, + file: file, + isDir: isDir, + } + r.fileList = append(r.fileList, entry) + if isDir { + knownDirs[name] = idx + } else { + files[name] = idx + } + } + for dir := range dirs { + if _, ok := knownDirs[dir]; !ok { + if idx, ok := files[dir]; ok { + r.fileList[idx].isDup = true + } else { + entry := fileListEntry{ + name: dir, + file: nil, + isDir: true, + } + r.fileList = append(r.fileList, entry) + } + } + } + + sort.Slice(r.fileList, func(i, j int) bool { return fileEntryLess(r.fileList[i].name, r.fileList[j].name) }) + }) +} + +func fileEntryLess(x, y string) bool { + xdir, xelem, _ := split(x) + ydir, yelem, _ := split(y) + return xdir < ydir || xdir == ydir && xelem < yelem +} + +// Open opens the named file in the ZIP archive, +// using the semantics of fs.FS.Open: +// paths are always slash separated, with no +// leading / or ../ elements. +func (r *Reader) Open(name string) (fs.File, error) { + r.initFileList() + + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrInvalid} + } + e := r.openLookup(name) + if e == nil { + return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrNotExist} + } + if e.isDir { + return &openDir{e, r.openReadDir(name), 0}, nil + } + rc, err := e.file.Open() + if err != nil { + return nil, err + } + return rc.(fs.File), nil +} + +func split(name string) (dir, elem string, isDir bool) { + if len(name) > 0 && name[len(name)-1] == '/' { + isDir = true + name = name[:len(name)-1] + } + i := len(name) - 1 + for i >= 0 && name[i] != '/' { + i-- + } + if i < 0 { + return ".", name, isDir + } + return name[:i], name[i+1:], isDir +} + +var dotFile = &fileListEntry{name: "./", isDir: true} + +func (r *Reader) openLookup(name string) *fileListEntry { + if name == "." 
{ + return dotFile + } + + dir, elem, _ := split(name) + files := r.fileList + i := sort.Search(len(files), func(i int) bool { + idir, ielem, _ := split(files[i].name) + return idir > dir || idir == dir && ielem >= elem + }) + if i < len(files) { + fname := files[i].name + if fname == name || len(fname) == len(name)+1 && fname[len(name)] == '/' && fname[:len(name)] == name { + return &files[i] + } + } + return nil +} + +func (r *Reader) openReadDir(dir string) []fileListEntry { + files := r.fileList + i := sort.Search(len(files), func(i int) bool { + idir, _, _ := split(files[i].name) + return idir >= dir + }) + j := sort.Search(len(files), func(j int) bool { + jdir, _, _ := split(files[j].name) + return jdir > dir + }) + return files[i:j] +} + +type openDir struct { + e *fileListEntry + files []fileListEntry + offset int +} + +func (d *openDir) Close() error { return nil } +func (d *openDir) Stat() (fs.FileInfo, error) { return d.e.stat() } + +func (d *openDir) Read([]byte) (int, error) { + return 0, &fs.PathError{Op: "read", Path: d.e.name, Err: errors.New("is a directory")} +} + +func (d *openDir) ReadDir(count int) ([]fs.DirEntry, error) { + n := len(d.files) - d.offset + if count > 0 && n > count { + n = count + } + if n == 0 { + if count <= 0 { + return nil, nil + } + return nil, io.EOF + } + list := make([]fs.DirEntry, n) + for i := range list { + s, err := d.files[d.offset+i].stat() + if err != nil { + return nil, err + } + list[i] = s + } + d.offset += n + return list, nil +} diff --git a/vendor/github.com/klauspost/compress/zip/register.go b/vendor/github.com/klauspost/compress/zip/register.go index 3bcb1ffd7a..8ea8889382 100644 --- a/vendor/github.com/klauspost/compress/zip/register.go +++ b/vendor/github.com/klauspost/compress/zip/register.go @@ -7,7 +7,6 @@ package zip import ( "errors" "io" - "io/ioutil" "sync" "github.com/klauspost/compress/flate" @@ -21,7 +20,7 @@ import ( type Compressor func(w io.Writer) (io.WriteCloser, error) // A Decompressor returns a new decompressing reader, reading from r. -// The ReadCloser's Close method must be used to release associated resources. +// The [io.ReadCloser]'s Close method must be used to release associated resources. // The Decompressor itself must be safe to invoke from multiple goroutines // simultaneously, but each returned reader will be used only by // one goroutine at a time. @@ -112,12 +111,12 @@ func init() { compressors.Store(Store, Compressor(func(w io.Writer) (io.WriteCloser, error) { return &nopCloser{w}, nil })) compressors.Store(Deflate, Compressor(func(w io.Writer) (io.WriteCloser, error) { return newFlateWriter(w), nil })) - decompressors.Store(Store, Decompressor(ioutil.NopCloser)) + decompressors.Store(Store, Decompressor(io.NopCloser)) decompressors.Store(Deflate, Decompressor(newFlateReader)) } // RegisterDecompressor allows custom decompressors for a specified method ID. -// The common methods Store and Deflate are built in. +// The common methods [Store] and [Deflate] are built in. func RegisterDecompressor(method uint16, dcomp Decompressor) { if _, dup := decompressors.LoadOrStore(method, dcomp); dup { panic("decompressor already registered") @@ -125,7 +124,7 @@ func RegisterDecompressor(method uint16, dcomp Decompressor) { } // RegisterCompressor registers custom compressors for a specified method ID. -// The common methods Store and Deflate are built in. +// The common methods [Store] and [Deflate] are built in. 
func RegisterCompressor(method uint16, comp Compressor) { if _, dup := compressors.LoadOrStore(method, comp); dup { panic("compressor already registered") diff --git a/vendor/github.com/klauspost/compress/zip/struct.go b/vendor/github.com/klauspost/compress/zip/struct.go index 686e79781a..2637e9c235 100644 --- a/vendor/github.com/klauspost/compress/zip/struct.go +++ b/vendor/github.com/klauspost/compress/zip/struct.go @@ -5,7 +5,7 @@ /* Package zip provides support for reading and writing ZIP archives. -See: https://www.pkware.com/appnote +See the [ZIP specification] for details. This package does not support disk spanning. @@ -16,11 +16,13 @@ fields. The 64 bit fields will always contain the correct value and for normal archives both fields will be the same. For files requiring the ZIP64 format the 32 bit fields will be 0xffffffff and the 64 bit fields must be used instead. + +[ZIP specification]: https://support.pkware.com/pkzip/appnote */ package zip import ( - "os" + "io/fs" "path" "time" ) @@ -42,7 +44,7 @@ const ( directoryHeaderLen = 46 // + filename + extra + comment directoryEndLen = 22 // + comment dataDescriptorLen = 16 // four uint32: descriptor signature, crc32, compressed size, size - dataDescriptor64Len = 24 // descriptor with 8 byte sizes + dataDescriptor64Len = 24 // two uint32: signature, crc32 | two uint64: compressed size, size directory64LocLen = 20 // directory64EndLen = 56 // + extra @@ -65,7 +67,7 @@ const ( // // IDs 0..31 are reserved for official use by PKWARE. // IDs above that range are defined by third-party vendors. - // Since ZIP lacked high precision timestamps (nor a official specification + // Since ZIP lacked high precision timestamps (nor an official specification // of the timezone used for the date fields), many competing extra fields // have been invented. Pervasive use effectively makes them "official". // @@ -77,21 +79,16 @@ const ( infoZipUnixExtraID = 0x5855 // Info-ZIP Unix extension ) -// FileHeader describes a file within a zip file. -// See the zip spec for details. +// FileHeader describes a file within a ZIP file. +// See the [ZIP specification] for details. +// +// [ZIP specification]: https://support.pkware.com/pkzip/appnote type FileHeader struct { // Name is the name of the file. // // It must be a relative path, not start with a drive letter (such as "C:"), // and must use forward slashes instead of back slashes. A trailing slash // indicates that this file is a directory and should have no data. - // - // When reading zip files, the Name field is populated from - // the zip file directly and is not validated for correctness. - // It is the caller's responsibility to sanitize it as - // appropriate, including canonicalizing slash directions, - // validating that paths are relative, and preventing path - // traversal through filenames ("../../../"). Name string // Comment is any arbitrary user-defined string shorter than 64KiB. @@ -124,25 +121,51 @@ type FileHeader struct { // When writing, an extended timestamp (which is timezone-agnostic) is // always emitted. The legacy MS-DOS date field is encoded according to the // location of the Modified time. - Modified time.Time - ModifiedTime uint16 // Deprecated: Legacy MS-DOS date; use Modified instead. - ModifiedDate uint16 // Deprecated: Legacy MS-DOS time; use Modified instead. - - CRC32 uint32 - CompressedSize uint32 // Deprecated: Use CompressedSize64 instead. - UncompressedSize uint32 // Deprecated: Use UncompressedSize64 instead. 
-	CompressedSize64 uint64
+	Modified time.Time
+
+	// ModifiedTime is an MS-DOS-encoded time.
+	//
+	// Deprecated: Use Modified instead.
+	ModifiedTime uint16
+
+	// ModifiedDate is an MS-DOS-encoded date.
+	//
+	// Deprecated: Use Modified instead.
+	ModifiedDate uint16
+
+	// CRC32 is the CRC32 checksum of the file content.
+	CRC32 uint32
+
+	// CompressedSize is the compressed size of the file in bytes.
+	// If either the uncompressed or compressed size of the file
+	// does not fit in 32 bits, CompressedSize is set to ^uint32(0).
+	//
+	// Deprecated: Use CompressedSize64 instead.
+	CompressedSize uint32
+
+	// UncompressedSize is the uncompressed size of the file in bytes.
+	// If either the uncompressed or compressed size of the file
+	// does not fit in 32 bits, UncompressedSize is set to ^uint32(0).
+	//
+	// Deprecated: Use UncompressedSize64 instead.
+	UncompressedSize uint32
+
+	// CompressedSize64 is the compressed size of the file in bytes.
+	CompressedSize64 uint64
+
+	// UncompressedSize64 is the uncompressed size of the file in bytes.
 	UncompressedSize64 uint64
-	Extra              []byte
-	ExternalAttrs      uint32 // Meaning depends on CreatorVersion
+
+	Extra         []byte
+	ExternalAttrs uint32 // Meaning depends on CreatorVersion
 }
 
-// FileInfo returns an os.FileInfo for the FileHeader.
-func (h *FileHeader) FileInfo() os.FileInfo {
+// FileInfo returns an fs.FileInfo for the [FileHeader].
+func (h *FileHeader) FileInfo() fs.FileInfo {
 	return headerFileInfo{h}
 }
 
-// headerFileInfo implements os.FileInfo.
+// headerFileInfo implements [fs.FileInfo].
 type headerFileInfo struct {
 	fh *FileHeader
 }
@@ -161,17 +184,24 @@ func (fi headerFileInfo) ModTime() time.Time {
 	}
 	return fi.fh.Modified.UTC()
 }
-func (fi headerFileInfo) Mode() os.FileMode { return fi.fh.Mode() }
-func (fi headerFileInfo) Sys() interface{}  { return fi.fh }
+func (fi headerFileInfo) Mode() fs.FileMode { return fi.fh.Mode() }
+func (fi headerFileInfo) Type() fs.FileMode { return fi.fh.Mode().Type() }
+func (fi headerFileInfo) Sys() any          { return fi.fh }
 
-// FileInfoHeader creates a partially-populated FileHeader from an
-// os.FileInfo.
-// Because os.FileInfo's Name method returns only the base name of
+func (fi headerFileInfo) Info() (fs.FileInfo, error) { return fi, nil }
+
+func (fi headerFileInfo) String() string {
+	return formatFileInfo(fi)
+}
+
+// FileInfoHeader creates a partially-populated [FileHeader] from an
+// fs.FileInfo.
+// Because fs.FileInfo's Name method returns only the base name of
 // the file it describes, it may be necessary to modify the Name field
 // of the returned header to provide the full path name of the file.
 // If compression is desired, callers should set the FileHeader.Method
 // field; it is unset by default.
-func FileInfoHeader(fi os.FileInfo) (*FileHeader, error) {
+func FileInfoHeader(fi fs.FileInfo) (*FileHeader, error) {
 	size := fi.Size()
 	fh := &FileHeader{
 		Name: fi.Name(),
@@ -215,7 +245,7 @@ func timeZone(offset time.Duration) *time.Location {
 
 // msDosTimeToTime converts an MS-DOS date and time into a time.Time.
 // The resolution is 2s.
-// See: https://msdn.microsoft.com/en-us/library/ms724247(v=VS.85).aspx
+// See: https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-dosdatetimetofiletime
 func msDosTimeToTime(dosDate, dosTime uint16) time.Time {
 	return time.Date(
 		// date bits 0-4: day of month; 5-8: month; 9-15: years since 1980
@@ -235,7 +265,7 @@ func msDosTimeToTime(dosDate, dosTime uint16) time.Time {
 
 // timeToMsDosTime converts a time.Time to an MS-DOS date and time.
// The resolution is 2s. -// See: https://msdn.microsoft.com/en-us/library/ms724274(v=VS.85).aspx +// See: https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-filetimetodosdatetime func timeToMsDosTime(t time.Time) (fDate uint16, fTime uint16) { fDate = uint16(t.Day() + int(t.Month())<<5 + (t.Year()-1980)<<9) fTime = uint16(t.Second()/2 + t.Minute()<<5 + t.Hour()<<11) @@ -243,17 +273,17 @@ func timeToMsDosTime(t time.Time) (fDate uint16, fTime uint16) { } // ModTime returns the modification time in UTC using the legacy -// ModifiedDate and ModifiedTime fields. +// [ModifiedDate] and [ModifiedTime] fields. // -// Deprecated: Use Modified instead. +// Deprecated: Use [Modified] instead. func (h *FileHeader) ModTime() time.Time { return msDosTimeToTime(h.ModifiedDate, h.ModifiedTime) } -// SetModTime sets the Modified, ModifiedTime, and ModifiedDate fields +// SetModTime sets the [Modified], [ModifiedTime], and [ModifiedDate] fields // to the given time in UTC. // -// Deprecated: Use Modified instead. +// Deprecated: Use [Modified] instead. func (h *FileHeader) SetModTime(t time.Time) { t = t.UTC() // Convert to UTC for compatibility h.Modified = t @@ -279,8 +309,8 @@ const ( msdosReadOnly = 0x01 ) -// Mode returns the permission and mode bits for the FileHeader. -func (h *FileHeader) Mode() (mode os.FileMode) { +// Mode returns the permission and mode bits for the [FileHeader]. +func (h *FileHeader) Mode() (mode fs.FileMode) { switch h.CreatorVersion >> 8 { case creatorUnix, creatorMacOSX: mode = unixModeToFileMode(h.ExternalAttrs >> 16) @@ -288,18 +318,18 @@ func (h *FileHeader) Mode() (mode os.FileMode) { mode = msdosModeToFileMode(h.ExternalAttrs) } if len(h.Name) > 0 && h.Name[len(h.Name)-1] == '/' { - mode |= os.ModeDir + mode |= fs.ModeDir } return mode } -// SetMode changes the permission and mode bits for the FileHeader. -func (h *FileHeader) SetMode(mode os.FileMode) { +// SetMode changes the permission and mode bits for the [FileHeader]. +func (h *FileHeader) SetMode(mode fs.FileMode) { h.CreatorVersion = h.CreatorVersion&0xff | creatorUnix<<8 h.ExternalAttrs = fileModeToUnixMode(mode) << 16 // set MSDOS attributes too, as the original zip does. 
- if mode&os.ModeDir != 0 { + if mode&fs.ModeDir != 0 { h.ExternalAttrs |= msdosDir } if mode&0200 == 0 { @@ -312,9 +342,13 @@ func (h *FileHeader) isZip64() bool { return h.CompressedSize64 >= uint32max || h.UncompressedSize64 >= uint32max } -func msdosModeToFileMode(m uint32) (mode os.FileMode) { +func (h *FileHeader) hasDataDescriptor() bool { + return h.Flags&0x8 != 0 +} + +func msdosModeToFileMode(m uint32) (mode fs.FileMode) { if m&msdosDir != 0 { - mode = os.ModeDir | 0777 + mode = fs.ModeDir | 0777 } else { mode = 0666 } @@ -324,64 +358,62 @@ func msdosModeToFileMode(m uint32) (mode os.FileMode) { return mode } -func fileModeToUnixMode(mode os.FileMode) uint32 { +func fileModeToUnixMode(mode fs.FileMode) uint32 { var m uint32 - switch mode & os.ModeType { + switch mode & fs.ModeType { default: m = s_IFREG - case os.ModeDir: + case fs.ModeDir: m = s_IFDIR - case os.ModeSymlink: + case fs.ModeSymlink: m = s_IFLNK - case os.ModeNamedPipe: + case fs.ModeNamedPipe: m = s_IFIFO - case os.ModeSocket: + case fs.ModeSocket: m = s_IFSOCK - case os.ModeDevice: - if mode&os.ModeCharDevice != 0 { - m = s_IFCHR - } else { - m = s_IFBLK - } + case fs.ModeDevice: + m = s_IFBLK + case fs.ModeDevice | fs.ModeCharDevice: + m = s_IFCHR } - if mode&os.ModeSetuid != 0 { + if mode&fs.ModeSetuid != 0 { m |= s_ISUID } - if mode&os.ModeSetgid != 0 { + if mode&fs.ModeSetgid != 0 { m |= s_ISGID } - if mode&os.ModeSticky != 0 { + if mode&fs.ModeSticky != 0 { m |= s_ISVTX } return m | uint32(mode&0777) } -func unixModeToFileMode(m uint32) os.FileMode { - mode := os.FileMode(m & 0777) +func unixModeToFileMode(m uint32) fs.FileMode { + mode := fs.FileMode(m & 0777) switch m & s_IFMT { case s_IFBLK: - mode |= os.ModeDevice + mode |= fs.ModeDevice case s_IFCHR: - mode |= os.ModeDevice | os.ModeCharDevice + mode |= fs.ModeDevice | fs.ModeCharDevice case s_IFDIR: - mode |= os.ModeDir + mode |= fs.ModeDir case s_IFIFO: - mode |= os.ModeNamedPipe + mode |= fs.ModeNamedPipe case s_IFLNK: - mode |= os.ModeSymlink + mode |= fs.ModeSymlink case s_IFREG: // nothing to do case s_IFSOCK: - mode |= os.ModeSocket + mode |= fs.ModeSocket } if m&s_ISGID != 0 { - mode |= os.ModeSetgid + mode |= fs.ModeSetgid } if m&s_ISUID != 0 { - mode |= os.ModeSetuid + mode |= fs.ModeSetuid } if m&s_ISVTX != 0 { - mode |= os.ModeSticky + mode |= fs.ModeSticky } return mode } diff --git a/vendor/github.com/klauspost/compress/zip/writer.go b/vendor/github.com/klauspost/compress/zip/writer.go index 335b637c8f..b85bb91d86 100644 --- a/vendor/github.com/klauspost/compress/zip/writer.go +++ b/vendor/github.com/klauspost/compress/zip/writer.go @@ -11,6 +11,7 @@ import ( "hash" "hash/crc32" "io" + "io/fs" "strings" "unicode/utf8" ) @@ -20,16 +21,11 @@ var ( errLongExtra = errors.New("zip: FileHeader.Extra too long") ) -type lastWriter interface { - Close() error - Closed() bool -} - // Writer implements a zip file writer. type Writer struct { cw *countWriter dir []*header - last lastWriter + last *fileWriter closed bool compressors map[uint16]Compressor comment string @@ -42,9 +38,10 @@ type Writer struct { type header struct { *FileHeader offset uint64 + raw bool } -// NewWriter returns a new Writer writing a zip file to w. +// NewWriter returns a new [Writer] writing a zip file to w. func NewWriter(w io.Writer) *Writer { return &Writer{cw: &countWriter{w: bufio.NewWriter(w)}} } @@ -67,7 +64,7 @@ func (w *Writer) Flush() error { } // SetComment sets the end-of-central-directory comment field. 
-// It can only be called before Close. +// It can only be called before [Writer.Close]. func (w *Writer) SetComment(comment string) error { if len(comment) > uint16max { return errors.New("zip: Writer.Comment too long") @@ -77,10 +74,10 @@ func (w *Writer) SetComment(comment string) error { } // Close finishes writing the zip file by writing the central directory. -// It does not Close the underlying writer. +// It does not close the underlying writer. func (w *Writer) Close() error { - if w.last != nil && !w.last.Closed() { - if err := w.last.Close(); err != nil { + if w.last != nil && !w.last.closed { + if err := w.last.close(); err != nil { return err } w.last = nil @@ -211,14 +208,14 @@ func (w *Writer) Close() error { } // Create adds a file to the zip file using the provided name. -// It returns a Writer to which the file contents should be written. -// The file contents will be compressed using the Deflate method. +// It returns a [Writer] to which the file contents should be written. +// The file contents will be compressed using the [Deflate] method. // The name must be a relative path: it must not start with a drive // letter (e.g. C:) or leading slash, and only forward slashes are // allowed. To create a directory instead of a file, add a trailing // slash to the name. -// The file's contents must be written to the io.Writer before the next -// call to Create, CreateHeader, CreateHeaderRaw, or Close. +// The file's contents must be written to the [io.Writer] before the next +// call to [Writer.Create], [Writer.CreateHeader], or [Writer.Close]. func (w *Writer) Create(name string) (io.Writer, error) { header := &FileHeader{ Name: name, @@ -227,25 +224,6 @@ func (w *Writer) Create(name string) (io.Writer, error) { return w.CreateHeader(header) } -// Copy will copy raw content from input file. -// Optionally a different name can be given to the new file. -func (w *Writer) Copy(name string, src *File) error { - header := src.FileHeader - if name != "" { - header.Name = name - } - raw, err := src.OpenRaw() - if err != nil { - return err - } - wr, err := w.CreateHeaderRaw(&header) - if err != nil { - return err - } - _, err = io.Copy(wr, raw) - return err -} - // detectUTF8 reports whether s is a valid UTF-8 string, and whether the string // must be considered UTF-8 encoding (i.e., not compatible with CP-437, ASCII, // or any other common encoding). @@ -269,22 +247,31 @@ func detectUTF8(s string) (valid, require bool) { return true, require } -// CreateHeader adds a file to the zip archive using the provided FileHeader -// for the file metadata. Writer takes ownership of fh and may mutate -// its fields. The caller must not modify fh after calling CreateHeader. -// -// This returns a Writer to which the file contents should be written. -// The file's contents must be written to the io.Writer before the next -// call to Create, Copy, CreateHeader, CreateHeaderRaw or Close. -func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { - if w.last != nil && !w.last.Closed() { - if err := w.last.Close(); err != nil { - return nil, err +// prepare performs the bookkeeping operations required at the start of +// CreateHeader and CreateRaw. +func (w *Writer) prepare(fh *FileHeader) error { + if w.last != nil && !w.last.closed { + if err := w.last.close(); err != nil { + return err } } if len(w.dir) > 0 && w.dir[len(w.dir)-1].FileHeader == fh { // See https://golang.org/issue/11144 confusion. 
- return nil, errors.New("archive/zip: invalid duplicate FileHeader") + return errors.New("archive/zip: invalid duplicate FileHeader") + } + return nil +} + +// CreateHeader adds a file to the zip archive using the provided [FileHeader] +// for the file metadata. [Writer] takes ownership of fh and may mutate +// its fields. The caller must not modify fh after calling [Writer.CreateHeader]. +// +// This returns a [Writer] to which the file contents should be written. +// The file's contents must be written to the io.Writer before the next +// call to [Writer.Create], [Writer.CreateHeader], [Writer.CreateRaw], or [Writer.Close]. +func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { + if err := w.prepare(fh); err != nil { + return nil, err } // The ZIP format has a sad state of affairs regarding character encoding. @@ -343,7 +330,10 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { fh.Extra = append(fh.Extra, mbuf[:]...) } - var ow io.Writer + var ( + ow io.Writer + fw *fileWriter + ) h := &header{ FileHeader: fh, offset: uint64(w.cw.count), @@ -364,11 +354,10 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { fh.UncompressedSize64 = 0 ow = dirWriter{} - w.last = nil } else { fh.Flags |= 0x8 // we will write a data descriptor - fw := &fileWriter{ + fw = &fileWriter{ zipw: w.cw, compCount: &countWriter{w: w.cw}, crc32: crc32.NewIEEE(), @@ -385,137 +374,17 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { fw.rawCount = &countWriter{w: fw.comp} fw.header = h ow = fw - w.last = fw - } - w.dir = append(w.dir, h) - if err := writeHeader(w.cw, fh); err != nil { - return nil, err - } - // If we're creating a directory, fw is nil. - return ow, nil -} - -// CreateHeaderRaw adds a file to the zip archive using the provided FileHeader -// for the file metadata. Writer takes ownership of fh and may mutate -// its fields. The caller must not modify fh after calling CreateHeaderRaw. -// -// This returns a Writer to which the compressed file contents should be written. -// The file's contents must be written to the io.Writer before the next -// call to Create, Copy, CreateHeader, CreateHeaderRaw or Close. -// -// Using this requires knowledge of populating the FileHeader correctly (the -// UncompressedSize64 and CRC32 fields should be set and valid for the contents -// written). For copying from an existing zip file, the Copy() function is -// recommended. -func (w *Writer) CreateHeaderRaw(fh *FileHeader) (io.Writer, error) { - if w.last != nil && !w.last.Closed() { - if err := w.last.Close(); err != nil { - return nil, err - } - } - if len(w.dir) > 0 && w.dir[len(w.dir)-1].FileHeader == fh { - // See https://golang.org/issue/11144 confusion. - return nil, errors.New("archive/zip: invalid duplicate FileHeader") - } - - // The ZIP format has a sad state of affairs regarding character encoding. - // Officially, the name and comment fields are supposed to be encoded - // in CP-437 (which is mostly compatible with ASCII), unless the UTF-8 - // flag bit is set. However, there are several problems: - // - // * Many ZIP readers still do not support UTF-8. - // * If the UTF-8 flag is cleared, several readers simply interpret the - // name and comment fields as whatever the local system encoding is. - // - // In order to avoid breaking readers without UTF-8 support, - // we avoid setting the UTF-8 flag if the strings are CP-437 compatible. 
- // However, if the strings require multibyte UTF-8 encoding and is a - // valid UTF-8 string, then we set the UTF-8 bit. - // - // For the case, where the user explicitly wants to specify the encoding - // as UTF-8, they will need to set the flag bit themselves. - utf8Valid1, utf8Require1 := detectUTF8(fh.Name) - utf8Valid2, utf8Require2 := detectUTF8(fh.Comment) - switch { - case fh.NonUTF8: - fh.Flags &^= 0x800 - case (utf8Require1 || utf8Require2) && (utf8Valid1 && utf8Valid2): - fh.Flags |= 0x800 - } - - fh.CreatorVersion = fh.CreatorVersion&0xff00 | zipVersion20 // preserve compatibility byte - fh.ReaderVersion = zipVersion20 - - // If Modified is set, this takes precedence over MS-DOS timestamp fields. - if !fh.Modified.IsZero() { - // Contrary to the FileHeader.SetModTime method, we intentionally - // do not convert to UTC, because we assume the user intends to encode - // the date using the specified timezone. A user may want this control - // because many legacy ZIP readers interpret the timestamp according - // to the local timezone. - // - // The timezone is only non-UTC if a user directly sets the Modified - // field directly themselves. All other approaches sets UTC. - fh.ModifiedDate, fh.ModifiedTime = timeToMsDosTime(fh.Modified) - - // Use "extended timestamp" format since this is what Info-ZIP uses. - // Nearly every major ZIP implementation uses a different format, - // but at least most seem to be able to understand the other formats. - // - // This format happens to be identical for both local and central header - // if modification time is the only timestamp being encoded. - var mbuf [9]byte // 2*SizeOf(uint16) + SizeOf(uint8) + SizeOf(uint32) - mt := uint32(fh.Modified.Unix()) - eb := writeBuf(mbuf[:]) - eb.uint16(extTimeExtraID) - eb.uint16(5) // Size: SizeOf(uint8) + SizeOf(uint32) - eb.uint8(1) // Flags: ModTime - eb.uint32(mt) // ModTime - fh.Extra = append(fh.Extra, mbuf[:]...) - } - - var ow io.Writer - h := &header{ - FileHeader: fh, - offset: uint64(w.cw.count), - } - - if strings.HasSuffix(fh.Name, "/") { - // Set the compression method to Store to ensure data length is truly zero, - // which the writeHeader method always encodes for the size fields. - // This is necessary as most compression formats have non-zero lengths - // even when compressing an empty string. - fh.Method = Store - fh.Flags &^= 0x8 // we will not write a data descriptor - - // Explicitly clear sizes as they have no meaning for directories. - fh.CompressedSize = 0 - fh.CompressedSize64 = 0 - fh.UncompressedSize = 0 - fh.UncompressedSize64 = 0 - - ow = dirWriter{} - w.last = nil - } else { - fh.Flags |= 0x8 // we will write a data descriptor - - fw := &rawWriter{ - header: h, - zipw: w.cw, - rawCount: &countWriter{w: w.cw}, - } - ow = fw - w.last = fw } w.dir = append(w.dir, h) - if err := writeHeader(w.cw, fh); err != nil { + if err := writeHeader(w.cw, h); err != nil { return nil, err } // If we're creating a directory, fw is nil. 
+	w.last = fw
 	return ow, nil
 }
 
-func writeHeader(w io.Writer, h *FileHeader) error {
+func writeHeader(w io.Writer, h *header) error {
 	const maxUint16 = 1<<16 - 1
 	if len(h.Name) > maxUint16 {
 		return errLongName
@@ -532,9 +401,20 @@
 	b.uint16(h.Method)
 	b.uint16(h.ModifiedTime)
 	b.uint16(h.ModifiedDate)
-	b.uint32(0) // since we are writing a data descriptor crc32,
-	b.uint32(0) // compressed size,
-	b.uint32(0) // and uncompressed size should be zero
+	// In raw mode (caller does the compression), the values are either
+	// written here or in the trailing data descriptor based on the header
+	// flags.
+	if h.raw && !h.hasDataDescriptor() {
+		b.uint32(h.CRC32)
+		b.uint32(uint32(min64(h.CompressedSize64, uint32max)))
+		b.uint32(uint32(min64(h.UncompressedSize64, uint32max)))
+	} else {
+		// When this package handles the compression, these values are
+		// always written to the trailing data descriptor.
+		b.uint32(0) // crc32
+		b.uint32(0) // compressed size
+		b.uint32(0) // uncompressed size
+	}
 	b.uint16(uint16(len(h.Name)))
 	b.uint16(uint16(len(h.Extra)))
 	if _, err := w.Write(buf[:]); err != nil {
@@ -547,8 +427,67 @@
 	return err
 }
 
+func min64(x, y uint64) uint64 {
+	if x < y {
+		return x
+	}
+	return y
+}
+
+// CreateRaw adds a file to the zip archive using the provided [FileHeader] and
+// returns a [Writer] to which the file contents should be written. The file's
+// contents must be written to the io.Writer before the next call to [Writer.Create],
+// [Writer.CreateHeader], [Writer.CreateRaw], or [Writer.Close].
+//
+// In contrast to [Writer.CreateHeader], the bytes passed to Writer are not compressed.
+func (w *Writer) CreateRaw(fh *FileHeader) (io.Writer, error) {
+	if err := w.prepare(fh); err != nil {
+		return nil, err
+	}
+
+	fh.CompressedSize = uint32(min64(fh.CompressedSize64, uint32max))
+	fh.UncompressedSize = uint32(min64(fh.UncompressedSize64, uint32max))
+
+	h := &header{
+		FileHeader: fh,
+		offset:     uint64(w.cw.count),
+		raw:        true,
+	}
+	w.dir = append(w.dir, h)
+	if err := writeHeader(w.cw, h); err != nil {
+		return nil, err
+	}
+
+	if strings.HasSuffix(fh.Name, "/") {
+		w.last = nil
+		return dirWriter{}, nil
+	}
+
+	fw := &fileWriter{
+		header: h,
+		zipw:   w.cw,
+	}
+	w.last = fw
+	return fw, nil
+}
+
+// Copy copies the file f (obtained from a [Reader]) into w. It copies the raw
+// form directly, bypassing decompression, compression, and validation.
+func (w *Writer) Copy(f *File) error {
+	r, err := f.OpenRaw()
+	if err != nil {
+		return err
+	}
+	fw, err := w.CreateRaw(&f.FileHeader)
+	if err != nil {
+		return err
+	}
+	_, err = io.Copy(fw, r)
+	return err
+}
+
 // RegisterCompressor registers or overrides a custom compressor for a specific
-// method ID. If a compressor for a given method is not found, Writer will
+// method ID. If a compressor for a given method is not found, [Writer] will
 // default to looking up the compressor at the package level.
 func (w *Writer) RegisterCompressor(method uint16, comp Compressor) {
 	if w.compressors == nil {
@@ -557,6 +496,44 @@
 	w.compressors[method] = comp
 }
 
+// AddFS adds the files from fs.FS to the archive.
+// It walks the directory tree starting at the root of the filesystem,
+// adding each file to the zip using deflate while maintaining the directory structure.
+func (w *Writer) AddFS(fsys fs.FS) error { + return fs.WalkDir(fsys, ".", func(name string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + return nil + } + info, err := d.Info() + if err != nil { + return err + } + if !info.Mode().IsRegular() { + return errors.New("zip: cannot add non-regular file") + } + h, err := FileInfoHeader(info) + if err != nil { + return err + } + h.Name = name + h.Method = Deflate + fw, err := w.CreateHeader(h) + if err != nil { + return err + } + f, err := fsys.Open(name) + if err != nil { + return err + } + defer f.Close() + _, err = io.Copy(fw, f) + return err + }) +} + func (w *Writer) compressor(method uint16) Compressor { comp := w.compressors[method] if comp == nil { @@ -588,19 +565,21 @@ func (w *fileWriter) Write(p []byte) (int, error) { if w.closed { return 0, errors.New("zip: write to closed file") } + if w.raw { + return w.zipw.Write(p) + } w.crc32.Write(p) return w.rawCount.Write(p) } -func (w *fileWriter) Closed() bool { - return w.closed -} - -func (w *fileWriter) Close() error { +func (w *fileWriter) close() error { if w.closed { return errors.New("zip: file closed twice") } w.closed = true + if w.raw { + return w.writeDataDescriptor() + } if err := w.comp.Close(); err != nil { return err } @@ -620,86 +599,33 @@ func (w *fileWriter) Close() error { fh.UncompressedSize = uint32(fh.UncompressedSize64) } - // Write data descriptor. This is more complicated than one would - // think, see e.g. comments in zipfile.c:putextended() and - // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7073588. - // The approach here is to write 8 byte sizes if needed without - // adding a zip64 extra in the local header (too late anyway). - var buf []byte - if fh.isZip64() { - buf = make([]byte, dataDescriptor64Len) - } else { - buf = make([]byte, dataDescriptorLen) - } - b := writeBuf(buf) - b.uint32(dataDescriptorSignature) // de-facto standard, required by OS X - b.uint32(fh.CRC32) - if fh.isZip64() { - b.uint64(fh.CompressedSize64) - b.uint64(fh.UncompressedSize64) - } else { - b.uint32(fh.CompressedSize) - b.uint32(fh.UncompressedSize) - } - _, err := w.zipw.Write(buf) - return err -} - -type rawWriter struct { - *header - zipw io.Writer - rawCount *countWriter - closed bool -} - -func (w *rawWriter) Write(p []byte) (int, error) { - if w.closed { - return 0, errors.New("zip: write to closed file") - } - return w.rawCount.Write(p) + return w.writeDataDescriptor() } -func (w *rawWriter) Closed() bool { - return w.closed -} - -func (w *rawWriter) Close() error { - if w.closed { - return errors.New("zip: file closed twice") - } - w.closed = true - fh := w.FileHeader - fh.CompressedSize64 = uint64(w.rawCount.count) - - if fh.isZip64() { - fh.CompressedSize = uint32max - fh.UncompressedSize = uint32max - fh.ReaderVersion = zipVersion45 // requires 4.5 - File uses ZIP64 format extensions - } else { - fh.CompressedSize = uint32(fh.CompressedSize64) - fh.UncompressedSize = uint32(fh.UncompressedSize64) +func (w *fileWriter) writeDataDescriptor() error { + if !w.hasDataDescriptor() { + return nil } - // Write data descriptor. This is more complicated than one would // think, see e.g. comments in zipfile.c:putextended() and // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7073588. // The approach here is to write 8 byte sizes if needed without // adding a zip64 extra in the local header (too late anyway). 
var buf []byte - if fh.isZip64() { + if w.isZip64() { buf = make([]byte, dataDescriptor64Len) } else { buf = make([]byte, dataDescriptorLen) } b := writeBuf(buf) b.uint32(dataDescriptorSignature) // de-facto standard, required by OS X - b.uint32(fh.CRC32) - if fh.isZip64() { - b.uint64(fh.CompressedSize64) - b.uint64(fh.UncompressedSize64) + b.uint32(w.CRC32) + if w.isZip64() { + b.uint64(w.CompressedSize64) + b.uint64(w.UncompressedSize64) } else { - b.uint32(fh.CompressedSize) - b.uint32(fh.UncompressedSize) + b.uint32(w.CompressedSize) + b.uint32(w.UncompressedSize) } _, err := w.zipw.Write(buf) return err diff --git a/vendor/github.com/klauspost/compress/zlib/reader.go b/vendor/github.com/klauspost/compress/zlib/reader.go new file mode 100644 index 0000000000..cb652b9089 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zlib/reader.go @@ -0,0 +1,187 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package zlib implements reading and writing of zlib format compressed data, +as specified in RFC 1950. + +The implementation provides filters that uncompress during reading +and compress during writing. For example, to write compressed data +to a buffer: + + var b bytes.Buffer + w := zlib.NewWriter(&b) + w.Write([]byte("hello, world\n")) + w.Close() + +and to read that data back: + + r, err := zlib.NewReader(&b) + io.Copy(os.Stdout, r) + r.Close() +*/ +package zlib + +import ( + "bufio" + "compress/zlib" + "encoding/binary" + "hash" + "hash/adler32" + "io" + + "github.com/klauspost/compress/flate" +) + +const ( + zlibDeflate = 8 + zlibMaxWindow = 7 +) + +var ( + // ErrChecksum is returned when reading ZLIB data that has an invalid checksum. + ErrChecksum = zlib.ErrChecksum + // ErrDictionary is returned when reading ZLIB data that has an invalid dictionary. + ErrDictionary = zlib.ErrDictionary + // ErrHeader is returned when reading ZLIB data that has an invalid header. + ErrHeader = zlib.ErrHeader +) + +type reader struct { + r flate.Reader + decompressor io.ReadCloser + digest hash.Hash32 + err error + scratch [4]byte +} + +// Resetter resets a ReadCloser returned by [NewReader] or [NewReaderDict] +// to switch to a new underlying Reader. This permits reusing a ReadCloser +// instead of allocating a new one. +type Resetter interface { + // Reset discards any buffered data and resets the Resetter as if it was + // newly initialized with the given reader. + Reset(r io.Reader, dict []byte) error +} + +// NewReader creates a new ReadCloser. +// Reads from the returned ReadCloser read and decompress data from r. +// If r does not implement [io.ByteReader], the decompressor may read more +// data than necessary from r. +// It is the caller's responsibility to call Close on the ReadCloser when done. +// +// The [io.ReadCloser] returned by NewReader also implements [Resetter]. +func NewReader(r io.Reader) (io.ReadCloser, error) { + return NewReaderDict(r, nil) +} + +// NewReaderDict is like [NewReader] but uses a preset dictionary. +// NewReaderDict ignores the dictionary if the compressed data does not refer to it. +// If the compressed data refers to a different dictionary, NewReaderDict returns [ErrDictionary]. +// +// The ReadCloser returned by NewReaderDict also implements [Resetter]. 
+func NewReaderDict(r io.Reader, dict []byte) (io.ReadCloser, error) { + z := new(reader) + err := z.Reset(r, dict) + if err != nil { + return nil, err + } + return z, nil +} + +func (z *reader) Read(p []byte) (int, error) { + if z.err != nil { + return 0, z.err + } + + var n int + n, z.err = z.decompressor.Read(p) + z.digest.Write(p[0:n]) + if z.err != io.EOF { + // In the normal case we return here. + return n, z.err + } + + // Finished file; check checksum. + if _, err := io.ReadFull(z.r, z.scratch[0:4]); err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + z.err = err + return n, z.err + } + // ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952). + checksum := binary.BigEndian.Uint32(z.scratch[:4]) + if checksum != z.digest.Sum32() { + z.err = ErrChecksum + return n, z.err + } + return n, io.EOF +} + +// Calling Close does not close the wrapped [io.Reader] originally passed to [NewReader]. +// In order for the ZLIB checksum to be verified, the reader must be +// fully consumed until the [io.EOF]. +func (z *reader) Close() error { + if z.err != nil && z.err != io.EOF { + return z.err + } + z.err = z.decompressor.Close() + return z.err +} + +func (z *reader) Reset(r io.Reader, dict []byte) error { + *z = reader{decompressor: z.decompressor} + if fr, ok := r.(flate.Reader); ok { + z.r = fr + } else { + z.r = bufio.NewReader(r) + } + + // Read the header (RFC 1950 section 2.2.). + _, z.err = io.ReadFull(z.r, z.scratch[0:2]) + if z.err != nil { + if z.err == io.EOF { + z.err = io.ErrUnexpectedEOF + } + return z.err + } + h := binary.BigEndian.Uint16(z.scratch[:2]) + if (z.scratch[0]&0x0f != zlibDeflate) || (z.scratch[0]>>4 > zlibMaxWindow) || (h%31 != 0) { + z.err = ErrHeader + return z.err + } + haveDict := z.scratch[1]&0x20 != 0 + if haveDict { + _, z.err = io.ReadFull(z.r, z.scratch[0:4]) + if z.err != nil { + if z.err == io.EOF { + z.err = io.ErrUnexpectedEOF + } + return z.err + } + checksum := binary.BigEndian.Uint32(z.scratch[:4]) + if checksum != adler32.Checksum(dict) { + z.err = ErrDictionary + return z.err + } + } + + if z.decompressor == nil { + if haveDict { + z.decompressor = flate.NewReaderDict(z.r, dict) + } else { + z.decompressor = flate.NewReader(z.r) + } + } else { + z.decompressor.(flate.Resetter).Reset(z.r, dict) + } + + if z.digest != nil { + z.digest.Reset() + } else { + z.digest = adler32.New() + } + return nil +} diff --git a/vendor/github.com/klauspost/compress/zlib/writer.go b/vendor/github.com/klauspost/compress/zlib/writer.go new file mode 100644 index 0000000000..cab9ef3eb0 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zlib/writer.go @@ -0,0 +1,195 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package zlib + +import ( + "encoding/binary" + "fmt" + "hash" + "hash/adler32" + "io" + + "github.com/klauspost/compress/flate" +) + +// These constants are copied from the flate package, so that code that imports +// "compress/zlib" does not also have to import "compress/flate". +const ( + NoCompression = flate.NoCompression + BestSpeed = flate.BestSpeed + BestCompression = flate.BestCompression + DefaultCompression = flate.DefaultCompression + ConstantCompression = flate.ConstantCompression // Deprecated: Use HuffmanOnly. + HuffmanOnly = flate.HuffmanOnly +) + +// A Writer takes data written to it and writes the compressed +// form of that data to an underlying writer (see NewWriter). 
+type Writer struct { + w io.Writer + level int + dict []byte + compressor *flate.Writer + digest hash.Hash32 + err error + scratch [4]byte + wroteHeader bool +} + +// NewWriter creates a new Writer. +// Writes to the returned Writer are compressed and written to w. +// +// It is the caller's responsibility to call Close on the Writer when done. +// Writes may be buffered and not flushed until Close. +func NewWriter(w io.Writer) *Writer { + z, _ := NewWriterLevelDict(w, DefaultCompression, nil) + return z +} + +// NewWriterLevel is like NewWriter but specifies the compression level instead +// of assuming DefaultCompression. +// +// The compression level can be DefaultCompression, NoCompression, HuffmanOnly +// or any integer value between BestSpeed and BestCompression inclusive. +// The error returned will be nil if the level is valid. +func NewWriterLevel(w io.Writer, level int) (*Writer, error) { + return NewWriterLevelDict(w, level, nil) +} + +// NewWriterLevelDict is like NewWriterLevel but specifies a dictionary to +// compress with. +// +// The dictionary may be nil. If not, its contents should not be modified until +// the Writer is closed. +func NewWriterLevelDict(w io.Writer, level int, dict []byte) (*Writer, error) { + if level < HuffmanOnly || level > BestCompression { + return nil, fmt.Errorf("zlib: invalid compression level: %d", level) + } + return &Writer{ + w: w, + level: level, + dict: dict, + }, nil +} + +// Reset clears the state of the Writer z such that it is equivalent to its +// initial state from NewWriterLevel or NewWriterLevelDict, but instead writing +// to w. +func (z *Writer) Reset(w io.Writer) { + z.w = w + // z.level and z.dict left unchanged. + if z.compressor != nil { + z.compressor.Reset(w) + } + if z.digest != nil { + z.digest.Reset() + } + z.err = nil + z.scratch = [4]byte{} + z.wroteHeader = false +} + +// writeHeader writes the ZLIB header. +func (z *Writer) writeHeader() (err error) { + z.wroteHeader = true + // ZLIB has a two-byte header (as documented in RFC 1950). + // The first four bits is the CINFO (compression info), which is 7 for the default deflate window size. + // The next four bits is the CM (compression method), which is 8 for deflate. + z.scratch[0] = 0x78 + // The next two bits is the FLEVEL (compression level). The four values are: + // 0=fastest, 1=fast, 2=default, 3=best. + // The next bit, FDICT, is set if a dictionary is given. + // The final five FCHECK bits form a mod-31 checksum. + switch z.level { + case -2, 0, 1: + z.scratch[1] = 0 << 6 + case 2, 3, 4, 5: + z.scratch[1] = 1 << 6 + case 6, -1: + z.scratch[1] = 2 << 6 + case 7, 8, 9: + z.scratch[1] = 3 << 6 + default: + panic("unreachable") + } + if z.dict != nil { + z.scratch[1] |= 1 << 5 + } + z.scratch[1] += uint8(31 - binary.BigEndian.Uint16(z.scratch[:2])%31) + if _, err = z.w.Write(z.scratch[0:2]); err != nil { + return err + } + if z.dict != nil { + // The next four bytes are the Adler-32 checksum of the dictionary. + binary.BigEndian.PutUint32(z.scratch[:], adler32.Checksum(z.dict)) + if _, err = z.w.Write(z.scratch[0:4]); err != nil { + return err + } + } + if z.compressor == nil { + // Initialize deflater unless the Writer is being reused + // after a Reset call. + z.compressor, err = flate.NewWriterDict(z.w, z.level, z.dict) + if err != nil { + return err + } + z.digest = adler32.New() + } + return nil +} + +// Write writes a compressed form of p to the underlying io.Writer. 
+// The compressed bytes are not necessarily flushed until the Writer is closed or
+// explicitly flushed.
+func (z *Writer) Write(p []byte) (n int, err error) {
+	if !z.wroteHeader {
+		z.err = z.writeHeader()
+	}
+	if z.err != nil {
+		return 0, z.err
+	}
+	if len(p) == 0 {
+		return 0, nil
+	}
+	n, err = z.compressor.Write(p)
+	if err != nil {
+		z.err = err
+		return
+	}
+	z.digest.Write(p)
+	return
+}
+
+// Flush flushes the Writer to its underlying io.Writer.
+func (z *Writer) Flush() error {
+	if !z.wroteHeader {
+		z.err = z.writeHeader()
+	}
+	if z.err != nil {
+		return z.err
+	}
+	z.err = z.compressor.Flush()
+	return z.err
+}
+
+// Close closes the Writer, flushing any unwritten data to the underlying
+// io.Writer, but does not close the underlying io.Writer.
+func (z *Writer) Close() error {
+	if !z.wroteHeader {
+		z.err = z.writeHeader()
+	}
+	if z.err != nil {
+		return z.err
+	}
+	z.err = z.compressor.Close()
+	if z.err != nil {
+		return z.err
+	}
+	checksum := z.digest.Sum32()
+	// ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952).
+	binary.BigEndian.PutUint32(z.scratch[:], checksum)
+	_, z.err = z.w.Write(z.scratch[0:4])
+	return z.err
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index 7680bfe1dd..92e2347bbc 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -12,12 +12,13 @@ The `zstd` package is provided as open source software using a Go standard licen
 
 Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors.
 
+For seekable zstd streams, see [this excellent package](https://github.com/SaveTheRbtz/zstd-seekable-format-go).
+
 ## Installation
 
 Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.
 
-Godoc Documentation: https://godoc.org/github.com/klauspost/compress/zstd
-
+[![Go Reference](https://pkg.go.dev/badge/github.com/klauspost/compress/zstd.svg)](https://pkg.go.dev/github.com/klauspost/compress/zstd)
 
 ## Compressor
 
@@ -79,6 +80,9 @@ of a stream. This is independent of the `WithEncoderConcurrency(n)`, but that is
 in the future. So if you want to limit concurrency for future updates, specify the concurrency
 you would like.
 
+If you would like stream encoding to be done without spawning async goroutines, use `WithEncoderConcurrency(1)`,
+which will compress input as each block is completed, blocking on writes until each has completed.
+
 You can specify your desired compression level using the `WithEncoderLevel()` option.
 Currently only pre-defined compression settings can be specified.
 
@@ -105,7 +109,8 @@ and seems to ignore concatenated streams, even though [it is part of the spec](h
 For compressing small blocks, the returned encoder has a function called `EncodeAll(src, dst []byte) []byte`.
 
 `EncodeAll` will encode all input in src and append it to dst.
-This function can be called concurrently, but each call will only run on a single goroutine.
+This function can be called concurrently.
+Each call will run on the same goroutine as the caller.
 
 Encoded blocks can be concatenated and the result will be the combined input stream.
 Data compressed with EncodeAll can be decoded with the Decoder, using either a stream or `DecodeAll`.
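+
+As a minimal sketch of that buffer path (the helper name `CompressBuffer` is illustrative, not part of the API):
+
+```Go
+import "github.com/klauspost/compress/zstd"
+
+// Create one encoder and reuse it; EncodeAll is safe for concurrent use.
+// A nil Writer is fine when the encoder is only used for EncodeAll.
+var encoder, _ = zstd.NewWriter(nil)
+
+// CompressBuffer compresses src and returns the compressed bytes.
+func CompressBuffer(src []byte) []byte {
+	return encoder.EncodeAll(src, make([]byte, 0, len(src)))
+}
+```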
@@ -150,10 +155,10 @@ http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip This package: file out level insize outsize millis mb/s -silesia.tar zskp 1 211947520 73101992 643 313.87 -silesia.tar zskp 2 211947520 67504318 969 208.38 -silesia.tar zskp 3 211947520 65177448 1899 106.44 -silesia.tar zskp 4 211947520 61381950 8115 24.91 +silesia.tar zskp 1 211947520 73821326 634 318.47 +silesia.tar zskp 2 211947520 67655404 1508 133.96 +silesia.tar zskp 3 211947520 64746933 3000 67.37 +silesia.tar zskp 4 211947520 60073508 16926 11.94 cgo zstd: silesia.tar zstd 1 211947520 73605392 543 371.56 @@ -162,89 +167,99 @@ silesia.tar zstd 6 211947520 62916450 1913 105.66 silesia.tar zstd 9 211947520 60212393 5063 39.92 gzip, stdlib/this package: -silesia.tar gzstd 1 211947520 80007735 1654 122.21 -silesia.tar gzkp 1 211947520 80369488 1168 173.06 +silesia.tar gzstd 1 211947520 80007735 1498 134.87 +silesia.tar gzkp 1 211947520 80088272 1009 200.31 GOB stream of binary data. Highly compressible. https://files.klauspost.com/compress/gob-stream.7z file out level insize outsize millis mb/s -gob-stream zskp 1 1911399616 235022249 3088 590.30 -gob-stream zskp 2 1911399616 205669791 3786 481.34 -gob-stream zskp 3 1911399616 185792019 9324 195.48 -gob-stream zskp 4 1911399616 171537212 32113 56.76 +gob-stream zskp 1 1911399616 233948096 3230 564.34 +gob-stream zskp 2 1911399616 203997694 4997 364.73 +gob-stream zskp 3 1911399616 173526523 13435 135.68 +gob-stream zskp 4 1911399616 162195235 47559 38.33 + gob-stream zstd 1 1911399616 249810424 2637 691.26 gob-stream zstd 3 1911399616 208192146 3490 522.31 gob-stream zstd 6 1911399616 193632038 6687 272.56 gob-stream zstd 9 1911399616 177620386 16175 112.70 -gob-stream gzstd 1 1911399616 357382641 10251 177.82 -gob-stream gzkp 1 1911399616 362156523 5695 320.08 + +gob-stream gzstd 1 1911399616 357382013 9046 201.49 +gob-stream gzkp 1 1911399616 359136669 4885 373.08 The test data for the Large Text Compression Benchmark is the first 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006. http://mattmahoney.net/dc/textdata.html file out level insize outsize millis mb/s -enwik9 zskp 1 1000000000 343848582 3609 264.18 -enwik9 zskp 2 1000000000 317276632 5746 165.97 -enwik9 zskp 3 1000000000 294540704 11725 81.34 -enwik9 zskp 4 1000000000 276609671 44029 21.66 +enwik9 zskp 1 1000000000 343833605 3687 258.64 +enwik9 zskp 2 1000000000 317001237 7672 124.29 +enwik9 zskp 3 1000000000 291915823 15923 59.89 +enwik9 zskp 4 1000000000 261710291 77697 12.27 + enwik9 zstd 1 1000000000 358072021 3110 306.65 enwik9 zstd 3 1000000000 313734672 4784 199.35 enwik9 zstd 6 1000000000 295138875 10290 92.68 enwik9 zstd 9 1000000000 278348700 28549 33.40 -enwik9 gzstd 1 1000000000 382578136 9604 99.30 -enwik9 gzkp 1 1000000000 383825945 6544 145.73 + +enwik9 gzstd 1 1000000000 382578136 8608 110.78 +enwik9 gzkp 1 1000000000 382781160 5628 169.45 Highly compressible JSON file. 
https://files.klauspost.com/compress/github-june-2days-2019.json.zst file out level insize outsize millis mb/s -github-june-2days-2019.json zskp 1 6273951764 699045015 10620 563.40 -github-june-2days-2019.json zskp 2 6273951764 617881763 11687 511.96 -github-june-2days-2019.json zskp 3 6273951764 537511906 29252 204.54 -github-june-2days-2019.json zskp 4 6273951764 512796117 97791 61.18 +github-june-2days-2019.json zskp 1 6273951764 697439532 9789 611.17 +github-june-2days-2019.json zskp 2 6273951764 610876538 18553 322.49 +github-june-2days-2019.json zskp 3 6273951764 517662858 44186 135.41 +github-june-2days-2019.json zskp 4 6273951764 464617114 165373 36.18 + github-june-2days-2019.json zstd 1 6273951764 766284037 8450 708.00 github-june-2days-2019.json zstd 3 6273951764 661889476 10927 547.57 github-june-2days-2019.json zstd 6 6273951764 642756859 22996 260.18 github-june-2days-2019.json zstd 9 6273951764 601974523 52413 114.16 -github-june-2days-2019.json gzstd 1 6273951764 1164400847 29948 199.79 -github-june-2days-2019.json gzkp 1 6273951764 1128755542 19236 311.03 + +github-june-2days-2019.json gzstd 1 6273951764 1164397768 26793 223.32 +github-june-2days-2019.json gzkp 1 6273951764 1120631856 17693 338.16 VM Image, Linux mint with a few installed applications: https://files.klauspost.com/compress/rawstudio-mint14.7z file out level insize outsize millis mb/s -rawstudio-mint14.tar zskp 1 8558382592 3667489370 20210 403.84 -rawstudio-mint14.tar zskp 2 8558382592 3364592300 31873 256.07 -rawstudio-mint14.tar zskp 3 8558382592 3224594213 71751 113.75 -rawstudio-mint14.tar zskp 4 8558382592 3027332295 486243 16.79 +rawstudio-mint14.tar zskp 1 8558382592 3718400221 18206 448.29 +rawstudio-mint14.tar zskp 2 8558382592 3326118337 37074 220.15 +rawstudio-mint14.tar zskp 3 8558382592 3163842361 87306 93.49 +rawstudio-mint14.tar zskp 4 8558382592 2970480650 783862 10.41 + rawstudio-mint14.tar zstd 1 8558382592 3609250104 17136 476.27 rawstudio-mint14.tar zstd 3 8558382592 3341679997 29262 278.92 rawstudio-mint14.tar zstd 6 8558382592 3235846406 77904 104.77 rawstudio-mint14.tar zstd 9 8558382592 3160778861 140946 57.91 -rawstudio-mint14.tar gzstd 1 8558382592 3926257486 57722 141.40 -rawstudio-mint14.tar gzkp 1 8558382592 3970463184 41749 195.49 + +rawstudio-mint14.tar gzstd 1 8558382592 3926234992 51345 158.96 +rawstudio-mint14.tar gzkp 1 8558382592 3960117298 36722 222.26 CSV data: https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst file out level insize outsize millis mb/s -nyc-taxi-data-10M.csv zskp 1 3325605752 641339945 8925 355.35 -nyc-taxi-data-10M.csv zskp 2 3325605752 591748091 11268 281.44 -nyc-taxi-data-10M.csv zskp 3 3325605752 538490114 19880 159.53 -nyc-taxi-data-10M.csv zskp 4 3325605752 495986829 89368 35.49 +nyc-taxi-data-10M.csv zskp 1 3325605752 641319332 9462 335.17 +nyc-taxi-data-10M.csv zskp 2 3325605752 588976126 17570 180.50 +nyc-taxi-data-10M.csv zskp 3 3325605752 529329260 32432 97.79 +nyc-taxi-data-10M.csv zskp 4 3325605752 474949772 138025 22.98 + nyc-taxi-data-10M.csv zstd 1 3325605752 687399637 8233 385.18 nyc-taxi-data-10M.csv zstd 3 3325605752 598514411 10065 315.07 nyc-taxi-data-10M.csv zstd 6 3325605752 570522953 20038 158.27 nyc-taxi-data-10M.csv zstd 9 3325605752 517554797 64565 49.12 -nyc-taxi-data-10M.csv gzstd 1 3325605752 928656485 23876 132.83 -nyc-taxi-data-10M.csv gzkp 1 3325605752 924718719 16388 193.53 + +nyc-taxi-data-10M.csv gzstd 1 3325605752 928654908 21270 149.11 +nyc-taxi-data-10M.csv gzkp 1 3325605752 922273214 13929 227.68 ``` 
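+
+Putting the options above together, a minimal stream-compression sketch; `WithEncoderConcurrency(1)` keeps all encoding on the calling goroutine:
+
+```Go
+import (
+	"io"
+
+	"github.com/klauspost/compress/zstd"
+)
+
+// Compress copies in to out, compressing synchronously.
+// Close must be called to flush the final frame.
+func Compress(in io.Reader, out io.Writer) error {
+	enc, err := zstd.NewWriter(out, zstd.WithEncoderConcurrency(1))
+	if err != nil {
+		return err
+	}
+	if _, err := io.Copy(enc, in); err != nil {
+		enc.Close()
+		return err
+	}
+	return enc.Close()
+}
+```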
 ## Decompressor
 
-Staus: STABLE  - there may still be subtle bugs, but a wide variety of content has been tested.
+Status: STABLE  - there may still be subtle bugs, but a wide variety of content has been tested.
 
 This library is being continuously [fuzz-tested](https://github.com/klauspost/compress-fuzz),
 kindly supplied by [fuzzit.dev](https://fuzzit.dev/).
@@ -274,8 +289,13 @@ func Decompress(in io.Reader, out io.Writer) error {
 }
 ```
 
-It is important to use the "Close" function when you no longer need the Reader to stop running goroutines.
-See "Allocation-less operation" below.
+When running with default settings, it is important to use the "Close" function when you no longer
+need the Reader, in order to stop its goroutines.
+Goroutines will exit once an error has been returned, including `io.EOF` at the end of a stream.
+
+Streams are decoded concurrently in 4 asynchronous stages to give the best possible throughput.
+However, if you prefer synchronous decompression, use `WithDecoderConcurrency(1)`, which will decompress data
+only as it is requested.
+
 For decoding buffers, it could look something like this:
 
@@ -284,7 +304,7 @@ import "github.com/klauspost/compress/zstd"
 
 // Create a reader that caches decompressors.
 // For this operation type we supply a nil Reader.
-var decoder, _ = zstd.NewReader(nil)
+var decoder, _ = zstd.NewReader(nil, zstd.WithDecoderConcurrency(0))
 
 // Decompress a buffer. We don't supply a destination buffer,
 // so it will be allocated by the decoder.
@@ -294,9 +314,12 @@ func Decompress(src []byte) ([]byte, error) {
 ```
 
 Both of these cases should provide the functionality needed.
-The decoder can be used for *concurrent* decompression of multiple buffers.
+The decoder can be used for *concurrent* decompression of multiple buffers.
+By default 4 decompressors will be created.
+
 It will only allow a certain number of concurrent operations to run.
-To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.
+To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.
+It is possible to use `WithDecoderConcurrency(0)` to create GOMAXPROCS decoders.
 
 ### Dictionaries
 
@@ -348,70 +371,71 @@ In this case no unneeded allocations should be made.
 The buffer decoder does everything on the same goroutine and does nothing concurrently.
 It can however decode several buffers concurrently. Use `WithDecoderConcurrency(n)` to limit that.
 
-The stream decoder operates on
+The stream decoder will create goroutines that:
 
-* One goroutine reads input and splits the input to several block decoders.
-* A number of decoders will decode blocks.
-* A goroutine coordinates these blocks and sends history from one to the next.
+1) Read the input and split it into blocks.
+2) Decompress the literals.
+3) Decompress the sequences.
+4) Reconstruct the output stream.
 
 So effectively this also means the decoder will "read ahead" and prepare data to always be available for output.
 
+The concurrency level will, for streams, determine how many blocks ahead the decompression will start.
+
 Since "blocks" are quite dependent on the output of the previous block, stream decoding will only have limited concurrency.
 
-In practice this means that concurrency is often limited to utilizing about 2 cores effectively.
-
-
+In practice this means that concurrency is often limited to utilizing about 3 cores effectively.
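+
+For illustration, synchronous stream decoding as described above might look like this (`DecompressSync` is an illustrative name):
+
+```Go
+import (
+	"io"
+
+	"github.com/klauspost/compress/zstd"
+)
+
+// DecompressSync copies the decompressed contents of in to out.
+// WithDecoderConcurrency(1) avoids background goroutines; data is
+// only decompressed as it is read.
+func DecompressSync(in io.Reader, out io.Writer) error {
+	d, err := zstd.NewReader(in, zstd.WithDecoderConcurrency(1))
+	if err != nil {
+		return err
+	}
+	defer d.Close()
+	_, err = io.Copy(out, d)
+	return err
+}
+```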
+ ### Benchmarks -These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd). - The first two are streaming decodes and the last are smaller inputs. - + +Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used. + ``` -BenchmarkDecoderSilesia-8 3 385000067 ns/op 550.51 MB/s 5498 B/op 8 allocs/op -BenchmarkDecoderSilesiaCgo-8 6 197666567 ns/op 1072.25 MB/s 270672 B/op 8 allocs/op - -BenchmarkDecoderEnwik9-8 1 2027001600 ns/op 493.34 MB/s 10496 B/op 18 allocs/op -BenchmarkDecoderEnwik9Cgo-8 2 979499200 ns/op 1020.93 MB/s 270672 B/op 8 allocs/op - -Concurrent performance: - -BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16 28915 42469 ns/op 4340.07 MB/s 114 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16 116505 9965 ns/op 11900.16 MB/s 16 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16 8952 134272 ns/op 3588.70 MB/s 915 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16 11820 102538 ns/op 4161.90 MB/s 594 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16 34782 34184 ns/op 3661.88 MB/s 60 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16 27712 43447 ns/op 3500.58 MB/s 99 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16 62826 18750 ns/op 21845.10 MB/s 104 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16 631545 1794 ns/op 57078.74 MB/s 2 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16 1690140 712 ns/op 172938.13 MB/s 1 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16 10432 113593 ns/op 6180.73 MB/s 1143 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/html.zst-16 113206 10671 ns/op 9596.27 MB/s 15 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16 1530615 779 ns/op 5229.49 MB/s 0 B/op 0 allocs/op - -BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16 65217 16192 ns/op 11383.34 MB/s 46 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16 292671 4039 ns/op 29363.19 MB/s 6 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16 26314 46021 ns/op 10470.43 MB/s 293 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16 33897 34900 ns/op 12227.96 MB/s 205 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16 104348 11433 ns/op 10949.01 MB/s 20 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16 75949 15510 ns/op 9805.60 MB/s 32 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16 173910 6756 ns/op 60624.29 MB/s 37 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16 923076 1339 ns/op 76474.87 MB/s 1 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16 922920 1351 ns/op 91102.57 MB/s 2 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16 27649 43618 ns/op 16096.19 MB/s 407 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16 279073 4160 ns/op 24614.18 MB/s 6 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16 749938 1579 ns/op 2581.71 MB/s 0 B/op 0 allocs/op +BenchmarkDecoderSilesia-32 5 206878840 ns/op 1024.50 MB/s 49808 B/op 43 allocs/op +BenchmarkDecoderEnwik9-32 1 1271809000 ns/op 786.28 MB/s 72048 B/op 52 allocs/op + +Concurrent blocks, performance: + +BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32 67356 17857 ns/op 10321.96 MB/s 22.48 pct 102 B/op 0 
allocs/op +BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32 266656 4421 ns/op 26823.21 MB/s 11.89 pct 19 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32 20992 56842 ns/op 8477.17 MB/s 39.90 pct 754 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32 27456 43932 ns/op 9714.01 MB/s 33.27 pct 524 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32 78432 15047 ns/op 8319.15 MB/s 40.34 pct 66 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32 65800 18436 ns/op 8249.63 MB/s 37.75 pct 88 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32 102993 11523 ns/op 35546.09 MB/s 3.637 pct 143 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32 1000000 1070 ns/op 95720.98 MB/s 80.53 pct 3 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32 749802 1752 ns/op 70272.35 MB/s 100.0 pct 5 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32 22640 52934 ns/op 13263.37 MB/s 26.25 pct 1014 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/html.zst-32 226412 5232 ns/op 19572.27 MB/s 14.49 pct 20 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32 923041 1276 ns/op 3194.71 MB/s 31.26 pct 0 B/op 0 allocs/op ``` -This reflects the performance around May 2020, but this may be out of date. +This reflects the performance around May 2022, but this may be out of date. + +## Zstd inside ZIP files + +It is possible to use zstandard to compress individual files inside zip archives. +While this isn't widely supported, it can be useful for internal files. + +To support the compression and decompression of these files, you must register a compressor and decompressor. + +It is highly recommended to register the (de)compressors on individual zip Reader/Writer instances and NOT +to use the global registration functions. The main reason for this is that two registrations from +different packages will result in a panic. + +It is a good idea to only have a single compressor and decompressor, since they can be used for multiple zip +files concurrently, and using a single instance will allow reusing some resources. + +See [this example](https://pkg.go.dev/github.com/klauspost/compress/zstd#example-ZipCompressor) for +how to compress and decompress files inside zip archives. # Contributions Contributions are always welcome. For new features/fixes, remember to add tests and for performance enhancements include benchmarks. -For sending files for reproducing errors use a service like [goobox](https://goobox.io/#/upload) or similar to share your files. - For general feedback and experience reports, feel free to open an issue or write me on [Twitter](https://twitter.com/sh0dan). This package includes the excellent [`github.com/cespare/xxhash`](https://github.com/cespare/xxhash) package Copyright (c) 2016 Caleb Spare. diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go index 8544585371..25ca983941 100644 --- a/vendor/github.com/klauspost/compress/zstd/bitreader.go +++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go @@ -7,6 +7,7 @@ package zstd import ( "encoding/binary" "errors" + "fmt" "io" "math/bits" ) @@ -16,7 +17,6 @@ import ( // for aligning the input. type bitReader struct { in []byte - off uint // next byte to read is at in[off - 1] value uint64 // Maybe use [16]byte, but shifting is awkward.
bitsRead uint8 } @@ -27,7 +27,6 @@ func (b *bitReader) init(in []byte) error { return errors.New("corrupt stream: too short") } b.in = in - b.off = uint(len(in)) // The highest bit of the last byte indicates where to start v := in[len(in)-1] if v == 0 { @@ -50,16 +49,16 @@ func (b *bitReader) getBits(n uint8) int { if n == 0 /*|| b.bitsRead >= 64 */ { return 0 } - return b.getBitsFast(n) + return int(b.get32BitsFast(n)) } -// getBitsFast requires that at least one bit is requested every time. +// get32BitsFast requires that at least one bit is requested every time. // There are no checks if the buffer is filled. -func (b *bitReader) getBitsFast(n uint8) int { +func (b *bitReader) get32BitsFast(n uint8) uint32 { const regMask = 64 - 1 v := uint32((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask)) b.bitsRead += n - return int(v) + return v } // fillFast() will make sure at least 32 bits are available. @@ -68,21 +67,19 @@ func (b *bitReader) fillFast() { if b.bitsRead < 32 { return } - // 2 bounds checks. - v := b.in[b.off-4:] - v = v[:4] + v := b.in[len(b.in)-4:] + b.in = b.in[:len(b.in)-4] low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) b.value = (b.value << 32) | uint64(low) b.bitsRead -= 32 - b.off -= 4 } // fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read. func (b *bitReader) fillFastStart() { - // Do single re-slice to avoid bounds checks. - b.value = binary.LittleEndian.Uint64(b.in[b.off-8:]) + v := b.in[len(b.in)-8:] + b.in = b.in[:len(b.in)-8] + b.value = binary.LittleEndian.Uint64(v) b.bitsRead = 0 - b.off -= 8 } // fill() will make sure at least 32 bits are available. @@ -90,25 +87,25 @@ func (b *bitReader) fill() { if b.bitsRead < 32 { return } - if b.off >= 4 { - v := b.in[b.off-4:] - v = v[:4] + if len(b.in) >= 4 { + v := b.in[len(b.in)-4:] + b.in = b.in[:len(b.in)-4] low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) b.value = (b.value << 32) | uint64(low) b.bitsRead -= 32 - b.off -= 4 return } - for b.off > 0 { - b.value = (b.value << 8) | uint64(b.in[b.off-1]) - b.bitsRead -= 8 - b.off-- + + b.bitsRead -= uint8(8 * len(b.in)) + for len(b.in) > 0 { + b.value = (b.value << 8) | uint64(b.in[len(b.in)-1]) + b.in = b.in[:len(b.in)-1] } } // finished returns true if all bits have been read from the bit stream. func (b *bitReader) finished() bool { - return b.off == 0 && b.bitsRead >= 64 + return len(b.in) == 0 && b.bitsRead >= 64 } // overread returns true if more bits have been requested than is on the stream. @@ -118,13 +115,16 @@ func (b *bitReader) overread() bool { // remain returns the number of bits remaining. func (b *bitReader) remain() uint { - return b.off*8 + 64 - uint(b.bitsRead) + return 8*uint(len(b.in)) + 64 - uint(b.bitsRead) } // close the bitstream and returns an error if out-of-buffer reads occurred. func (b *bitReader) close() error { // Release reference. b.in = nil + if !b.finished() { + return fmt.Errorf("%d extra bits on block, should be 0", b.remain()) + } if b.bitsRead > 64 { return io.ErrUnexpectedEOF } diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go index 303ae90f94..1952f175b0 100644 --- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go +++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go @@ -5,8 +5,6 @@ package zstd -import "fmt" - // bitWriter will write bits. 
// First bit will be LSB of the first byte of output. type bitWriter struct { @@ -38,7 +36,7 @@ func (b *bitWriter) addBits16NC(value uint16, bits uint8) { b.nBits += bits } -// addBits32NC will add up to 32 bits. +// addBits32NC will add up to 31 bits. // It will not check if there is space for them, // so the caller must ensure that it has flushed recently. func (b *bitWriter) addBits32NC(value uint32, bits uint8) { @@ -46,6 +44,26 @@ func (b *bitWriter) addBits32NC(value uint32, bits uint8) { b.nBits += bits } +// addBits64NC will add up to 64 bits. +// There must be space for 32 bits. +func (b *bitWriter) addBits64NC(value uint64, bits uint8) { + if bits <= 31 { + b.addBits32Clean(uint32(value), bits) + return + } + b.addBits32Clean(uint32(value), 32) + b.flush32() + b.addBits32Clean(uint32(value>>32), bits-32) +} + +// addBits32Clean will add up to 32 bits. +// It will not check if there is space for them. +// The input must not contain more bits than specified. +func (b *bitWriter) addBits32Clean(value uint32, bits uint8) { + b.bitContainer |= uint64(value) << (b.nBits & 63) + b.nBits += bits +} + // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated. // It will not check if there is space for them, so the caller must ensure that it has flushed recently. func (b *bitWriter) addBits16Clean(value uint16, bits uint8) { @@ -53,80 +71,6 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) { b.nBits += bits } -// flush will flush all pending full bytes. -// There will be at least 56 bits available for writing when this has been called. -// Using flush32 is faster, but leaves less space for writing. -func (b *bitWriter) flush() { - v := b.nBits >> 3 - switch v { - case 0: - case 1: - b.out = append(b.out, - byte(b.bitContainer), - ) - case 2: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - ) - case 3: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - ) - case 4: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - ) - case 5: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - ) - case 6: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - ) - case 7: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - byte(b.bitContainer>>48), - ) - case 8: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - byte(b.bitContainer>>48), - byte(b.bitContainer>>56), - ) - default: - panic(fmt.Errorf("bits (%d) > 64", b.nBits)) - } - b.bitContainer >>= v << 3 - b.nBits &= 7 -} - // flush32 will flush out, so there are at least 32 bits available for writing. func (b *bitWriter) flush32() { if b.nBits < 32 { @@ -153,12 +97,11 @@ func (b *bitWriter) flushAlign() { // close will write the alignment bit and write the final byte(s) // to the output. -func (b *bitWriter) close() error { +func (b *bitWriter) close() { // End mark b.addBits16Clean(1, 1) // flush until next byte. 
b.flushAlign() - return nil } // reset and continue writing by appending to out. diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go index b51d922bda..9c28840c3b 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockdec.go +++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go @@ -5,9 +5,14 @@ package zstd import ( + "bytes" + "encoding/binary" "errors" "fmt" + "hash/crc32" "io" + "os" + "path/filepath" "sync" "github.com/klauspost/compress/huff0" @@ -38,14 +43,14 @@ const ( // maxCompressedBlockSize is the biggest allowed compressed block size (128KB) maxCompressedBlockSize = 128 << 10 + compressedBlockOverAlloc = 16 + maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc + // Maximum possible block size (all Raw+Uncompressed). maxBlockSize = (1 << 21) - 1 - // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header - maxCompressedLiteralSize = 1 << 18 - maxRLELiteralSize = 1 << 20 - maxMatchLen = 131074 - maxSequences = 0x7f00 + 0xffff + maxMatchLen = 131074 + maxSequences = 0x7f00 + 0xffff // We support slightly less than the reference decoder to be able to // use ints on 32 bit archs. @@ -76,20 +81,28 @@ type blockDec struct { // Window size of the block. WindowSize uint64 - history chan *history - input chan struct{} - result chan decodeOutput - sequenceBuf []seq - err error - decWG sync.WaitGroup + err error + + // Check against this crc, if hasCRC is true. + checkCRC uint32 + hasCRC bool // Frame to use for singlethreaded decoding. // Should not be used by the decoder itself since parent may be another frame. localFrame *frameDec + sequence []seqVals + + async struct { + newHist *history + literals []byte + seqData []byte + seqSize int // Size of uncompressed sequences + fcs uint64 + } + // Block is RLE, this is the size. RLESize uint32 - tmp [4]byte Type blockType @@ -109,13 +122,8 @@ func (b *blockDec) String() string { func newBlockDec(lowMem bool) *blockDec { b := blockDec{ - lowMem: lowMem, - result: make(chan decodeOutput, 1), - input: make(chan struct{}, 1), - history: make(chan *history, 1), + lowMem: lowMem, } - b.decWG.Add(1) - go b.startDecoder() return &b } @@ -123,44 +131,60 @@ func newBlockDec(lowMem bool) *blockDec { // Input must be a start of a block and will be at the end of the block when returned. func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { b.WindowSize = windowSize - tmp := br.readSmall(3) - if tmp == nil { - if debug { - println("Reading block header:", io.ErrUnexpectedEOF) - } - return io.ErrUnexpectedEOF + tmp, err := br.readSmall(3) + if err != nil { + println("Reading block header:", err) + return err } bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16) b.Last = bh&1 != 0 b.Type = blockType((bh >> 1) & 3) // find size. 
cSize := int(bh >> 3) - maxSize := maxBlockSize + maxSize := maxCompressedBlockSizeAlloc switch b.Type { case blockTypeReserved: return ErrReservedBlockType case blockTypeRLE: + if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) { + if debugDecoder { + printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b) + } + return ErrWindowSizeExceeded + } b.RLESize = uint32(cSize) if b.lowMem { maxSize = cSize } cSize = 1 case blockTypeCompressed: - if debug { + if debugDecoder { println("Data size on stream:", cSize) } b.RLESize = 0 - maxSize = maxCompressedBlockSize + maxSize = maxCompressedBlockSizeAlloc if windowSize < maxCompressedBlockSize && b.lowMem { - maxSize = int(windowSize) + maxSize = int(windowSize) + compressedBlockOverAlloc } if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize { - if debug { + if debugDecoder { printf("compressed block too big: csize:%d block: %+v\n", uint64(cSize), b) } return ErrCompressedSizeTooBig } + // Empty compressed blocks must at least be 2 bytes + // for Literals_Block_Type and one for Sequences_Section_Header. + if cSize < 2 { + return ErrBlockTooSmall + } case blockTypeRaw: + if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) { + if debugDecoder { + printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b) + } + return ErrWindowSizeExceeded + } + b.RLESize = 0 // We do not need a destination for raw blocks. maxSize = -1 @@ -169,25 +193,25 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { } // Read block data. - if cap(b.dataStorage) < cSize { - if b.lowMem { - b.dataStorage = make([]byte, 0, cSize) + if _, ok := br.(*byteBuf); !ok && cap(b.dataStorage) < cSize { + // byteBuf doesn't need a destination buffer. + if b.lowMem || cSize > maxCompressedBlockSize { + b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc) } else { - b.dataStorage = make([]byte, 0, maxBlockSize) + b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc) } } - if cap(b.dst) <= maxSize { - b.dst = make([]byte, 0, maxSize+1) - } - var err error b.data, err = br.readBig(cSize, b.dataStorage) if err != nil { - if debug { + if debugDecoder { println("Reading block:", err, "(", cSize, ")", len(b.data)) printf("%T", br) } return err } + if cap(b.dst) <= maxSize { + b.dst = make([]byte, 0, maxSize+1) + } return nil } @@ -196,85 +220,14 @@ func (b *blockDec) sendErr(err error) { b.Last = true b.Type = blockTypeReserved b.err = err - b.input <- struct{}{} } // Close will release resources. // Closed blockDec cannot be reset. func (b *blockDec) Close() { - close(b.input) - close(b.history) - close(b.result) - b.decWG.Wait() } -// decodeAsync will prepare decoding the block when it receives input. -// This will separate output and history. 
-func (b *blockDec) startDecoder() { - defer b.decWG.Done() - for range b.input { - //println("blockDec: Got block input") - switch b.Type { - case blockTypeRLE: - if cap(b.dst) < int(b.RLESize) { - if b.lowMem { - b.dst = make([]byte, b.RLESize) - } else { - b.dst = make([]byte, maxBlockSize) - } - } - o := decodeOutput{ - d: b, - b: b.dst[:b.RLESize], - err: nil, - } - v := b.data[0] - for i := range o.b { - o.b[i] = v - } - hist := <-b.history - hist.append(o.b) - b.result <- o - case blockTypeRaw: - o := decodeOutput{ - d: b, - b: b.data, - err: nil, - } - hist := <-b.history - hist.append(o.b) - b.result <- o - case blockTypeCompressed: - b.dst = b.dst[:0] - err := b.decodeCompressed(nil) - o := decodeOutput{ - d: b, - b: b.dst, - err: err, - } - if debug { - println("Decompressed to", len(b.dst), "bytes, error:", err) - } - b.result <- o - case blockTypeReserved: - // Used for returning errors. - <-b.history - b.result <- decodeOutput{ - d: b, - b: nil, - err: b.err, - } - default: - panic("Invalid block type") - } - if debug { - println("blockDec: Finished block") - } - } -} - -// decodeAsync will prepare decoding the block when it receives the history. -// If history is provided, it will not fetch it from the channel. +// decodeBuf func (b *blockDec) decodeBuf(hist *history) error { switch b.Type { case blockTypeRLE: @@ -282,7 +235,7 @@ func (b *blockDec) decodeBuf(hist *history) error { if b.lowMem { b.dst = make([]byte, b.RLESize) } else { - b.dst = make([]byte, maxBlockSize) + b.dst = make([]byte, maxCompressedBlockSize) } } b.dst = b.dst[:b.RLESize] @@ -297,14 +250,23 @@ func (b *blockDec) decodeBuf(hist *history) error { return nil case blockTypeCompressed: saved := b.dst - b.dst = hist.b - hist.b = nil + // Append directly to history + if hist.ignoreBuffer == 0 { + b.dst = hist.b + hist.b = nil + } else { + b.dst = b.dst[:0] + } err := b.decodeCompressed(hist) - if debug { + if debugDecoder { println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err) } - hist.b = b.dst - b.dst = saved + if hist.ignoreBuffer == 0 { + hist.b = b.dst + b.dst = saved + } else { + hist.appendKeep(b.dst) + } return err case blockTypeReserved: // Used for returning errors. @@ -314,30 +276,18 @@ func (b *blockDec) decodeBuf(hist *history) error { } } -// decodeCompressed will start decompressing a block. -// If no history is supplied the decoder will decodeAsync as much as possible -// before fetching from blockDec.history -func (b *blockDec) decodeCompressed(hist *history) error { - in := b.data - delayedHistory := hist == nil - - if delayedHistory { - // We must always grab history. - defer func() { - if hist == nil { - <-b.history - } - }() - } +func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err error) { // There must be at least one byte for Literals_Block_Type and one for Sequences_Section_Header if len(in) < 2 { - return ErrBlockTooSmall + return in, ErrBlockTooSmall } + litType := literalsBlockType(in[0] & 3) var litRegenSize int var litCompSize int sizeFormat := (in[0] >> 2) & 3 var fourStreams bool + var literals []byte switch litType { case literalsBlockRaw, literalsBlockRLE: switch sizeFormat { @@ -353,7 +303,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { // Regenerated_Size uses 20 bits (0-1048575). Literals_Section_Header uses 3 bytes. 
if len(in) < 3 { println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in)) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } litRegenSize = int(in[0]>>4) + (int(in[1]) << 4) + (int(in[2]) << 12) in = in[3:] @@ -364,7 +314,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { // Both Regenerated_Size and Compressed_Size use 10 bits (0-1023). if len(in) < 3 { println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in)) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) litRegenSize = int(n & 1023) @@ -375,7 +325,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { fourStreams = true if len(in) < 4 { println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in)) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) litRegenSize = int(n & 16383) @@ -385,7 +335,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { fourStreams = true if len(in) < 5 { println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in)) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) + (uint64(in[4]) << 28) litRegenSize = int(n & 262143) @@ -393,16 +343,18 @@ func (b *blockDec) decodeCompressed(hist *history) error { in = in[5:] } } - if debug { + if debugDecoder { println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams) } - var literals []byte - var huff *huff0.Scratch + if litRegenSize > int(b.WindowSize) || litRegenSize > maxCompressedBlockSize { + return in, ErrWindowSizeExceeded + } + switch litType { case literalsBlockRaw: if len(in) < litRegenSize { println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litRegenSize) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } literals = in[:litRegenSize] in = in[litRegenSize:] @@ -410,19 +362,13 @@ func (b *blockDec) decodeCompressed(hist *history) error { case literalsBlockRLE: if len(in) < 1 { println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", 1) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } if cap(b.literalBuf) < litRegenSize { if b.lowMem { - b.literalBuf = make([]byte, litRegenSize) + b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc) } else { - if litRegenSize > maxCompressedLiteralSize { - // Exceptional - b.literalBuf = make([]byte, litRegenSize) - } else { - b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize) - - } + b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc) } } literals = b.literalBuf[:litRegenSize] @@ -431,45 +377,82 @@ func (b *blockDec) decodeCompressed(hist *history) error { literals[i] = v } in = in[1:] - if debug { + if debugDecoder { printf("Found %d RLE compressed literals\n", litRegenSize) } case literalsBlockTreeless: if len(in) < litCompSize { println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } // Store compressed literals, so we defer decoding until we get history. 
literals = in[:litCompSize] in = in[litCompSize:] - if debug { + if debugDecoder { printf("Found %d compressed literals\n", litCompSize) } + huff := hist.huffTree + if huff == nil { + return in, errors.New("literal block was treeless, but no history was defined") + } + // Ensure we have space to store it. + if cap(b.literalBuf) < litRegenSize { + if b.lowMem { + b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc) + } else { + b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc) + } + } + var err error + // Use our out buffer. + huff.MaxDecodedSize = litRegenSize + if fourStreams { + literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals) + } else { + literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals) + } + // Make sure we don't leak our literals buffer + if err != nil { + println("decompressing literals:", err) + return in, err + } + if len(literals) != litRegenSize { + return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals)) + } + case literalsBlockCompressed: if len(in) < litCompSize { println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } literals = in[:litCompSize] in = in[litCompSize:] - huff = huffDecoderPool.Get().(*huff0.Scratch) - var err error // Ensure we have space to store it. if cap(b.literalBuf) < litRegenSize { if b.lowMem { - b.literalBuf = make([]byte, 0, litRegenSize) + b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc) } else { - b.literalBuf = make([]byte, 0, maxCompressedLiteralSize) + b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc) } } - if huff == nil { - huff = &huff0.Scratch{} + huff := hist.huffTree + if huff == nil || (hist.dict != nil && huff == hist.dict.litEnc) { + huff = huffDecoderPool.Get().(*huff0.Scratch) + if huff == nil { + huff = &huff0.Scratch{} + } + } + var err error + if debugDecoder { + println("huff table input:", len(literals), "CRC:", crc32.ChecksumIEEE(literals)) } huff, literals, err = huff0.ReadTable(literals, huff) if err != nil { println("reading huffman table:", err) - return err + return in, err } + hist.huffTree = huff + huff.MaxDecodedSize = litRegenSize // Use our out buffer. if fourStreams { literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals) @@ -478,27 +461,63 @@ func (b *blockDec) decodeCompressed(hist *history) error { } if err != nil { println("decoding compressed literals:", err) - return err + return in, err } // Make sure we don't leak our literals buffer if len(literals) != litRegenSize { - return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals)) + return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals)) } - if debug { + // Re-cap to get extra size. + literals = b.literalBuf[:len(literals)] + if debugDecoder { printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize) } } + hist.decoders.literals = literals + return in, nil +} + +// decodeCompressed will start decompressing a block. +func (b *blockDec) decodeCompressed(hist *history) error { + in := b.data + in, err := b.decodeLiterals(in, hist) + if err != nil { + return err + } + err = b.prepareSequences(in, hist) + if err != nil { + return err + } + if hist.decoders.nSeqs == 0 { + b.dst = append(b.dst, hist.decoders.literals...) 
+ return nil + } + before := len(hist.decoders.out) + err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:]) + if err != nil { + return err + } + if hist.decoders.maxSyncLen > 0 { + hist.decoders.maxSyncLen += uint64(before) + hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out)) + } + b.dst = hist.decoders.out + hist.recentOffsets = hist.decoders.prevOffset + return nil +} +func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) { + if debugDecoder { + printf("prepareSequences: %d byte(s) input\n", len(in)) + } // Decode Sequences // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section if len(in) < 1 { return ErrBlockTooSmall } + var nSeqs int seqHeader := in[0] - nSeqs := 0 switch { - case seqHeader == 0: - in = in[1:] case seqHeader < 128: nSeqs = int(seqHeader) in = in[1:] @@ -515,19 +534,16 @@ func (b *blockDec) decodeCompressed(hist *history) error { nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8) in = in[3:] } - // Allocate sequences - if cap(b.sequenceBuf) < nSeqs { - if b.lowMem { - b.sequenceBuf = make([]seq, nSeqs) - } else { - // Allocate max - b.sequenceBuf = make([]seq, nSeqs, maxSequences) + if nSeqs == 0 && len(in) != 0 { + // When no sequences, there should not be any more data... + if debugDecoder { + printf("prepareSequences: 0 sequences, but %d byte(s) left on stream\n", len(in)) } - } else { - // Reuse buffer - b.sequenceBuf = b.sequenceBuf[:nSeqs] + return ErrUnexpectedBlockSize } - var seqs = &sequenceDecs{} + + var seqs = &hist.decoders + seqs.nSeqs = nSeqs if nSeqs > 0 { if len(in) < 1 { return ErrBlockTooSmall @@ -535,12 +551,15 @@ func (b *blockDec) decodeCompressed(hist *history) error { br := byteReader{b: in, off: 0} compMode := br.Uint8() br.advance(1) - if debug { + if debugDecoder { printf("Compression modes: 0b%b", compMode) } + if compMode&3 != 0 { + return errors.New("corrupt block: reserved bits not zero") + } for i := uint(0); i < 3; i++ { mode := seqCompMode((compMode >> (6 - i*2)) & 3) - if debug { + if debugDecoder { println("Table", tableIndex(i), "is", mode) } var seq *sequenceDec @@ -556,6 +575,9 @@ func (b *blockDec) decodeCompressed(hist *history) error { } switch mode { case compModePredefined: + if seq.fse != nil && !seq.fse.preDefined { + fseDecoderPool.Put(seq.fse) + } seq.fse = &fsePredef[i] case compModeRLE: if br.remain() < 1 { @@ -563,34 +585,38 @@ func (b *blockDec) decodeCompressed(hist *history) error { } v := br.Uint8() br.advance(1) - dec := fseDecoderPool.Get().(*fseDecoder) + if seq.fse == nil || seq.fse.preDefined { + seq.fse = fseDecoderPool.Get().(*fseDecoder) + } symb, err := decSymbolValue(v, symbolTableX[i]) if err != nil { printf("RLE Transform table (%v) error: %v", tableIndex(i), err) return err } - dec.setRLE(symb) - seq.fse = dec - if debug { - printf("RLE set to %+v, code: %v", symb, v) + seq.fse.setRLE(symb) + if debugDecoder { + printf("RLE set to 0x%x, code: %v", symb, v) } case compModeFSE: - println("Reading table for", tableIndex(i)) - dec := fseDecoderPool.Get().(*fseDecoder) - err := dec.readNCount(&br, uint16(maxTableSymbol[i])) + if debugDecoder { + println("Reading table for", tableIndex(i)) + } + if seq.fse == nil || seq.fse.preDefined { + seq.fse = fseDecoderPool.Get().(*fseDecoder) + } + err := seq.fse.readNCount(&br, uint16(maxTableSymbol[i])) if err != nil { println("Read table error:", err) return err } - err = dec.transform(symbolTableX[i]) + err = seq.fse.transform(symbolTableX[i]) if err != nil { println("Transform table 
error:", err) return err } - if debug { - println("Read table ok", "symbolLen:", dec.symbolLen) + if debugDecoder { + println("Read table ok", "symbolLen:", seq.fse.symbolLen) } - seq.fse = dec case compModeRepeat: seq.repeat = true } @@ -600,140 +626,106 @@ func (b *blockDec) decodeCompressed(hist *history) error { } in = br.unread() } - - // Wait for history. - // All time spent after this is critical since it is strictly sequential. - if hist == nil { - hist = <-b.history - if hist.error { - return ErrDecoderClosed - } + if debugDecoder { + println("Literals:", len(seqs.literals), "hash:", xxhash.Sum64(seqs.literals), "and", seqs.nSeqs, "sequences.") } - // Decode treeless literal block. - if litType == literalsBlockTreeless { - // TODO: We could send the history early WITHOUT the stream history. - // This would allow decoding treeless literals before the byte history is available. - // Silencia stats: Treeless 4393, with: 32775, total: 37168, 11% treeless. - // So not much obvious gain here. - - if hist.huffTree == nil { - return errors.New("literal block was treeless, but no history was defined") - } - // Ensure we have space to store it. - if cap(b.literalBuf) < litRegenSize { - if b.lowMem { - b.literalBuf = make([]byte, 0, litRegenSize) - } else { - b.literalBuf = make([]byte, 0, maxCompressedLiteralSize) - } - } - var err error - // Use our out buffer. - huff = hist.huffTree - if fourStreams { - literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals) - } else { - literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals) - } - // Make sure we don't leak our literals buffer - if err != nil { - println("decompressing literals:", err) - return err - } - if len(literals) != litRegenSize { - return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals)) - } - } else { - if hist.huffTree != nil && huff != nil { - if hist.dict == nil || hist.dict.litEnc != hist.huffTree { - huffDecoderPool.Put(hist.huffTree) - } - hist.huffTree = nil + if nSeqs == 0 { + if len(b.sequence) > 0 { + b.sequence = b.sequence[:0] } + return nil } - if huff != nil { - hist.huffTree = huff + br := seqs.br + if br == nil { + br = &bitReader{} } - if debug { - println("Final literals:", len(literals), "hash:", xxhash.Sum64(literals), "and", nSeqs, "sequences.") + if err := br.init(in); err != nil { + return err } - if nSeqs == 0 { - // Decompressed content is defined entirely as Literals Section content. - b.dst = append(b.dst, literals...) - if delayedHistory { - hist.append(literals) + if err := seqs.initialize(br, hist, b.dst); err != nil { + println("initializing sequences:", err) + return err + } + // Extract blocks... 
+ if false && hist.dict == nil { + fatalErr := func(err error) { + if err != nil { + panic(err) + } } - return nil + fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize) + var buf bytes.Buffer + fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse)) + fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse)) + fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse)) + buf.Write(in) + os.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm) } - seqs, err := seqs.mergeHistory(&hist.decoders) - if err != nil { - return err - } - if debug { - println("History merged ok") + return nil +} + +func (b *blockDec) decodeSequences(hist *history) error { + if cap(b.sequence) < hist.decoders.nSeqs { + if b.lowMem { + b.sequence = make([]seqVals, 0, hist.decoders.nSeqs) + } else { + b.sequence = make([]seqVals, 0, 0x7F00+0xffff) + } } - br := &bitReader{} - if err := br.init(in); err != nil { - return err + b.sequence = b.sequence[:hist.decoders.nSeqs] + if hist.decoders.nSeqs == 0 { + hist.decoders.seqSize = len(hist.decoders.literals) + return nil } + hist.decoders.windowSize = hist.windowSize + hist.decoders.prevOffset = hist.recentOffsets - // TODO: Investigate if sending history without decoders are faster. - // This would allow the sequences to be decoded async and only have to construct stream history. - // If only recent offsets were not transferred, this would be an obvious win. - // Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded. + err := hist.decoders.decode(b.sequence) + hist.recentOffsets = hist.decoders.prevOffset + return err +} +func (b *blockDec) executeSequences(hist *history) error { hbytes := hist.b if len(hbytes) > hist.windowSize { hbytes = hbytes[len(hbytes)-hist.windowSize:] - // We do not need history any more. + // We do not need history anymore. if hist.dict != nil { hist.dict.content = nil } } - - if err := seqs.initialize(br, hist, literals, b.dst); err != nil { - println("initializing sequences:", err) - return err - } - - err = seqs.decode(nSeqs, br, hbytes) + hist.decoders.windowSize = hist.windowSize + hist.decoders.out = b.dst[:0] + err := hist.decoders.execute(b.sequence, hbytes) if err != nil { return err } - if !br.finished() { - return fmt.Errorf("%d extra bits on block, should be 0", br.remain()) - } + return b.updateHistory(hist) +} - err = br.close() - if err != nil { - printf("Closing sequences: %v, %+v\n", err, *br) - } +func (b *blockDec) updateHistory(hist *history) error { if len(b.data) > maxCompressedBlockSize { return fmt.Errorf("compressed block size too large (%d)", len(b.data)) } // Set output and release references. - b.dst = seqs.out - seqs.out, seqs.literals, seqs.hist = nil, nil, nil + b.dst = hist.decoders.out + hist.recentOffsets = hist.decoders.prevOffset - if !delayedHistory { - // If we don't have delayed history, no need to update. - hist.recentOffsets = seqs.prevOffset - return nil - } if b.Last { // if last block we don't care about history. println("Last block, no history returned") hist.b = hist.b[:0] return nil + } else { + hist.append(b.dst) + if debugDecoder { + println("Finished block with ", len(b.sequence), "sequences. 
Added", len(b.dst), "to history, now length", len(hist.b)) + } } - hist.append(b.dst) - hist.recentOffsets = seqs.prevOffset - if debug { - println("Finished block with literals:", len(literals), "and", nSeqs, "sequences.") - } + hist.decoders.out, hist.decoders.literals = nil, nil return nil } diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go index c85c40255d..32a7f401d5 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockenc.go +++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go @@ -22,28 +22,44 @@ type blockEnc struct { dictLitEnc *huff0.Scratch wr bitWriter - extraLits int - last bool - + extraLits int output []byte recentOffsets [3]uint32 prevRecentOffsets [3]uint32 + + last bool + lowMem bool } // init should be used once the block has been created. // If called more than once, the effect is the same as calling reset. func (b *blockEnc) init() { - if cap(b.literals) < maxCompressedLiteralSize { - b.literals = make([]byte, 0, maxCompressedLiteralSize) - } - const defSeqs = 200 - b.literals = b.literals[:0] - if cap(b.sequences) < defSeqs { - b.sequences = make([]seq, 0, defSeqs) - } - if cap(b.output) < maxCompressedBlockSize { - b.output = make([]byte, 0, maxCompressedBlockSize) + if b.lowMem { + // 1K literals + if cap(b.literals) < 1<<10 { + b.literals = make([]byte, 0, 1<<10) + } + const defSeqs = 20 + if cap(b.sequences) < defSeqs { + b.sequences = make([]seq, 0, defSeqs) + } + // 1K + if cap(b.output) < 1<<10 { + b.output = make([]byte, 0, 1<<10) + } + } else { + if cap(b.literals) < maxCompressedBlockSize { + b.literals = make([]byte, 0, maxCompressedBlockSize) + } + const defSeqs = 2000 + if cap(b.sequences) < defSeqs { + b.sequences = make([]seq, 0, defSeqs) + } + if cap(b.output) < maxCompressedBlockSize { + b.output = make([]byte, 0, maxCompressedBlockSize) + } } + if b.coders.mlEnc == nil { b.coders.mlEnc = &fseEncoder{} b.coders.mlPrev = &fseEncoder{} @@ -140,7 +156,7 @@ func (h *literalsHeader) setSize(regenLen int) { switch { case inBits < 5: lh |= (uint64(regenLen) << 3) | (1 << 60) - if debug { + if debugEncoder { got := int(lh>>3) & 0xff if got != regenLen { panic(fmt.Sprint("litRegenSize = ", regenLen, "(want) != ", got, "(got)")) @@ -168,7 +184,7 @@ func (h *literalsHeader) setSizes(compLen, inLen int, single bool) { lh |= 1 << 2 } lh |= (uint64(inLen) << 4) | (uint64(compLen) << (10 + 4)) | (3 << 60) - if debug { + if debugEncoder { const mmask = (1 << 24) - 1 n := (lh >> 4) & mmask if int(n&1023) != inLen { @@ -296,7 +312,7 @@ func (b *blockEnc) encodeRaw(a []byte) { bh.setType(blockTypeRaw) b.output = bh.appendTo(b.output[:0]) b.output = append(b.output, a...) - if debug { + if debugEncoder { println("Adding RAW block, length", len(a), "last:", b.last) } } @@ -309,7 +325,7 @@ func (b *blockEnc) encodeRawTo(dst, src []byte) []byte { bh.setType(blockTypeRaw) dst = bh.appendTo(dst) dst = append(dst, src...) 
- if debug { + if debugEncoder { println("Adding RAW block, length", len(src), "last:", b.last) } return dst @@ -323,7 +339,7 @@ func (b *blockEnc) encodeLits(lits []byte, raw bool) error { // Don't compress extremely small blocks if len(lits) < 8 || (len(lits) < 32 && b.dictLitEnc == nil) || raw { - if debug { + if debugEncoder { println("Adding RAW block, length", len(lits), "last:", b.last) } bh.setType(blockTypeRaw) @@ -345,17 +361,24 @@ func (b *blockEnc) encodeLits(lits []byte, raw bool) error { if len(lits) >= 1024 { // Use 4 Streams. out, reUsed, err = huff0.Compress4X(lits, b.litEnc) - } else if len(lits) > 32 { + } else if len(lits) > 16 { // Use 1 stream single = true out, reUsed, err = huff0.Compress1X(lits, b.litEnc) } else { err = huff0.ErrIncompressible } - + if err == nil && len(out)+5 > len(lits) { + // If we are close, we may still be worse or equal to raw. + var lh literalsHeader + lh.setSizes(len(out), len(lits), single) + if len(out)+lh.size() >= len(lits) { + err = huff0.ErrIncompressible + } + } switch err { case huff0.ErrIncompressible: - if debug { + if debugEncoder { println("Adding RAW block, length", len(lits), "last:", b.last) } bh.setType(blockTypeRaw) @@ -363,16 +386,16 @@ func (b *blockEnc) encodeLits(lits []byte, raw bool) error { b.output = append(b.output, lits...) return nil case huff0.ErrUseRLE: - if debug { + if debugEncoder { println("Adding RLE block, length", len(lits)) } bh.setType(blockTypeRLE) b.output = bh.appendTo(b.output) b.output = append(b.output, lits[0]) return nil + case nil: default: return err - case nil: } // Compressed... // Now, allow reuse @@ -380,12 +403,12 @@ func (b *blockEnc) encodeLits(lits []byte, raw bool) error { bh.setType(blockTypeCompressed) var lh literalsHeader if reUsed { - if debug { + if debugEncoder { println("Reused tree, compressed to", len(out)) } lh.setType(literalsBlockTreeless) } else { - if debug { + if debugEncoder { println("New tree, compressed to", len(out), "tree size:", len(b.litEnc.OutTable)) } lh.setType(literalsBlockCompressed) @@ -404,13 +427,23 @@ func (b *blockEnc) encodeLits(lits []byte, raw bool) error { return nil } +// encodeRLE will encode an RLE block. +func (b *blockEnc) encodeRLE(val byte, length uint32) { + var bh blockHeader + bh.setLast(b.last) + bh.setSize(length) + bh.setType(blockTypeRLE) + b.output = bh.appendTo(b.output) + b.output = append(b.output, val) +} + // fuzzFseEncoder can be used to fuzz the FSE encoder. func fuzzFseEncoder(data []byte) int { if len(data) > maxSequences || len(data) < 2 { return 0 } enc := fseEncoder{} - hist := enc.Histogram()[:256] + hist := enc.Histogram() maxSym := uint8(0) for i, v := range data { v = v & 63 @@ -456,8 +489,18 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { if len(b.sequences) == 0 { return b.encodeLits(b.literals, rawAllLits) } + if len(b.sequences) == 1 && len(org) > 0 && len(b.literals) <= 1 { + // Check common RLE cases. + seq := b.sequences[0] + if seq.litLen == uint32(len(b.literals)) && seq.offset-3 == 1 { + // Offset == 1 and 0 or 1 literals. + b.encodeRLE(org[0], b.sequences[0].matchLen+zstdMinMatch+seq.litLen) + return nil + } + } + // We want some difference to at least account for the headers. - saved := b.size - len(b.literals) - (b.size >> 5) + saved := b.size - len(b.literals) - (b.size >> 6) if saved < 16 { if org == nil { return errIncompressible @@ -487,7 +530,7 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { if len(b.literals) >= 1024 && !raw { // Use 4 Streams. 
out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc) - } else if len(b.literals) > 32 && !raw { + } else if len(b.literals) > 16 && !raw { // Use 1 stream single = true out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc) @@ -495,13 +538,24 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { err = huff0.ErrIncompressible } + if err == nil && len(out)+5 > len(b.literals) { + // If we are close, we may still be worse or equal to raw. + var lh literalsHeader + lh.setSize(len(b.literals)) + szRaw := lh.size() + lh.setSizes(len(out), len(b.literals), single) + szComp := lh.size() + if len(out)+szComp >= len(b.literals)+szRaw { + err = huff0.ErrIncompressible + } + } switch err { case huff0.ErrIncompressible: lh.setType(literalsBlockRaw) lh.setSize(len(b.literals)) b.output = lh.appendTo(b.output) b.output = append(b.output, b.literals...) - if debug { + if debugEncoder { println("Adding literals RAW, length", len(b.literals)) } case huff0.ErrUseRLE: @@ -509,27 +563,22 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { lh.setSize(len(b.literals)) b.output = lh.appendTo(b.output) b.output = append(b.output, b.literals[0]) - if debug { + if debugEncoder { println("Adding literals RLE") } - default: - if debug { - println("Adding literals ERROR:", err) - } - return err case nil: // Compressed litLen... if reUsed { - if debug { + if debugEncoder { println("reused tree") } lh.setType(literalsBlockTreeless) } else { - if debug { + if debugEncoder { println("new tree, size:", len(b.litEnc.OutTable)) } lh.setType(literalsBlockCompressed) - if debug { + if debugEncoder { _, _, err := huff0.ReadTable(out, nil) if err != nil { panic(err) @@ -537,16 +586,21 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { } } lh.setSizes(len(out), len(b.literals), single) - if debug { + if debugEncoder { printf("Compressed %d literals to %d bytes", len(b.literals), len(out)) println("Adding literal header:", lh) } b.output = lh.appendTo(b.output) b.output = append(b.output, out...) b.litEnc.Reuse = huff0.ReusePolicyAllow - if debug { + if debugEncoder { println("Adding literals compressed") } + default: + if debugEncoder { + println("Adding literals ERROR:", err) + } + return err } // Sequence compression @@ -561,7 +615,7 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { n := len(b.sequences) - 0x7f00 b.output = append(b.output, 255, uint8(n), uint8(n>>8)) } - if debug { + if debugEncoder { println("Encoding", len(b.sequences), "sequences") } b.genCodes() @@ -595,17 +649,17 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { nSize = nSize + (nSize+2*8*16)>>4 switch { case predefSize <= prevSize && predefSize <= nSize || forcePreDef: - if debug { + if debugEncoder { println("Using predefined", predefSize>>3, "<=", nSize>>3) } return preDef, compModePredefined case prevSize <= nSize: - if debug { + if debugEncoder { println("Using previous", prevSize>>3, "<=", nSize>>3) } return prev, compModeRepeat default: - if debug { + if debugEncoder { println("Using new, predef", predefSize>>3, ". 
previous:", prevSize>>3, ">", nSize>>3, "header max:", cur.maxHeaderSize()>>3, "bytes") println("tl:", cur.actualTableLog, "symbolLen:", cur.symbolLen, "norm:", cur.norm[:cur.symbolLen], "hist", cur.count[:cur.symbolLen]) } @@ -618,7 +672,7 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { if llEnc.useRLE { mode |= uint8(compModeRLE) << 6 llEnc.setRLE(b.sequences[0].llCode) - if debug { + if debugEncoder { println("llEnc.useRLE") } } else { @@ -629,7 +683,7 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { if ofEnc.useRLE { mode |= uint8(compModeRLE) << 4 ofEnc.setRLE(b.sequences[0].ofCode) - if debug { + if debugEncoder { println("ofEnc.useRLE") } } else { @@ -641,7 +695,7 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { if mlEnc.useRLE { mode |= uint8(compModeRLE) << 2 mlEnc.setRLE(b.sequences[0].mlCode) - if debug { + if debugEncoder { println("mlEnc.useRLE, code: ", b.sequences[0].mlCode, "value", b.sequences[0].matchLen) } } else { @@ -650,7 +704,7 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { mode |= uint8(m) << 2 } b.output = append(b.output, mode) - if debug { + if debugEncoder { printf("Compression modes: 0b%b", mode) } b.output, err = llEnc.writeCount(b.output) @@ -706,71 +760,72 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error { println("Encoded seq", seq, s, "codes:", s.llCode, s.mlCode, s.ofCode, "states:", ll.state, ml.state, of.state, "bits:", llB, mlB, ofB) } seq-- - if llEnc.maxBits+mlEnc.maxBits+ofEnc.maxBits <= 32 { - // No need to flush (common) - for seq >= 0 { - s = b.sequences[seq] - wr.flush32() - llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode] - // tabelog max is 8 for all. - of.encode(ofB) - ml.encode(mlB) - ll.encode(llB) - wr.flush32() - - // We checked that all can stay within 32 bits - wr.addBits32NC(s.litLen, llB.outBits) - wr.addBits32NC(s.matchLen, mlB.outBits) - wr.addBits32NC(s.offset, ofB.outBits) - - if debugSequences { - println("Encoded seq", seq, s) - } - - seq-- + // Store sequences in reverse... + for seq >= 0 { + s = b.sequences[seq] + + ofB := ofTT[s.ofCode] + wr.flush32() // tablelog max is below 8 for each, so it will fill max 24 bits. + //of.encode(ofB) + nbBitsOut := (uint32(of.state) + ofB.deltaNbBits) >> 16 + dstState := int32(of.state>>(nbBitsOut&15)) + int32(ofB.deltaFindState) + wr.addBits16NC(of.state, uint8(nbBitsOut)) + of.state = of.stateTable[dstState] + + // Accumulate extra bits. + outBits := ofB.outBits & 31 + extraBits := uint64(s.offset & bitMask32[outBits]) + extraBitsN := outBits + + mlB := mlTT[s.mlCode] + //ml.encode(mlB) + nbBitsOut = (uint32(ml.state) + mlB.deltaNbBits) >> 16 + dstState = int32(ml.state>>(nbBitsOut&15)) + int32(mlB.deltaFindState) + wr.addBits16NC(ml.state, uint8(nbBitsOut)) + ml.state = ml.stateTable[dstState] + + outBits = mlB.outBits & 31 + extraBits = extraBits<> 16 + dstState = int32(ll.state>>(nbBitsOut&15)) + int32(llB.deltaFindState) + wr.addBits16NC(ll.state, uint8(nbBitsOut)) + ll.state = ll.stateTable[dstState] + + outBits = llB.outBits & 31 + extraBits = extraBits<= 0 { - s = b.sequences[seq] - wr.flush32() - llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode] - // tabelog max is below 8 for each. 
- of.encode(ofB) - ml.encode(mlB) - ll.encode(llB) - wr.flush32() - - // ml+ll = max 32 bits total - wr.addBits32NC(s.litLen, llB.outBits) - wr.addBits32NC(s.matchLen, mlB.outBits) - wr.flush32() - wr.addBits32NC(s.offset, ofB.outBits) - - if debugSequences { - println("Encoded seq", seq, s) - } - seq-- - } + seq-- } ml.flush(mlEnc.actualTableLog) of.flush(ofEnc.actualTableLog) ll.flush(llEnc.actualTableLog) - err = wr.close() - if err != nil { - return err - } + wr.close() b.output = wr.out + // Maybe even add a bigger margin. if len(b.output)-3-bhOffset >= b.size { - // Maybe even add a bigger margin. + // Discard and encode as raw block. + b.output = b.encodeRawTo(b.output[:bhOffset], org) + b.popOffsets() b.litEnc.Reuse = huff0.ReusePolicyNone - return errIncompressible + return nil } // Size is output minus block header. bh.setSize(uint32(len(b.output)-bhOffset) - 3) - if debug { + if debugEncoder { println("Rewriting block header", bh) } _ = bh.appendTo(b.output[bhOffset:bhOffset]) @@ -785,14 +840,13 @@ func (b *blockEnc) genCodes() { // nothing to do return } - if len(b.sequences) > math.MaxUint16 { panic("can only encode up to 64K sequences") } // No bounds checks after here: - llH := b.coders.llEnc.Histogram()[:256] - ofH := b.coders.ofEnc.Histogram()[:256] - mlH := b.coders.mlEnc.Histogram()[:256] + llH := b.coders.llEnc.Histogram() + ofH := b.coders.ofEnc.Histogram() + mlH := b.coders.mlEnc.Histogram() for i := range llH { llH[i] = 0 } @@ -804,7 +858,8 @@ func (b *blockEnc) genCodes() { } var llMax, ofMax, mlMax uint8 - for i, seq := range b.sequences { + for i := range b.sequences { + seq := &b.sequences[i] v := llCode(seq.litLen) seq.llCode = v llH[v]++ @@ -828,7 +883,6 @@ func (b *blockEnc) genCodes() { panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d), matchlen: %d", mlMax, seq.matchLen)) } } - b.sequences[i] = seq } maxCount := func(a []uint32) int { var max uint32 diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go index 658ef78380..55a388553d 100644 --- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go +++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go @@ -7,13 +7,12 @@ package zstd import ( "fmt" "io" - "io/ioutil" ) type byteBuffer interface { // Read up to 8 bytes. - // Returns nil if no more input is available. - readSmall(n int) []byte + // Returns io.ErrUnexpectedEOF if this cannot be satisfied. + readSmall(n int) ([]byte, error) // Read >8 bytes. // MAY use the destination slice. @@ -23,23 +22,23 @@ type byteBuffer interface { readByte() (byte, error) // Skip n bytes. - skipN(n int) error + skipN(n int64) error } // in-memory buffer type byteBuf []byte -func (b *byteBuf) readSmall(n int) []byte { +func (b *byteBuf) readSmall(n int) ([]byte, error) { if debugAsserts && n > 8 { panic(fmt.Errorf("small read > 8 (%d). 
use readBig", n)) } bb := *b if len(bb) < n { - return nil + return nil, io.ErrUnexpectedEOF } r := bb[:n] *b = bb[n:] - return r + return r, nil } func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) { @@ -52,23 +51,22 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) { return r, nil } -func (b *byteBuf) remain() []byte { - return *b -} - func (b *byteBuf) readByte() (byte, error) { bb := *b if len(bb) < 1 { - return 0, nil + return 0, io.ErrUnexpectedEOF } r := bb[0] *b = bb[1:] return r, nil } -func (b *byteBuf) skipN(n int) error { +func (b *byteBuf) skipN(n int64) error { bb := *b - if len(bb) < n { + if n < 0 { + return fmt.Errorf("negative skip (%d) requested", n) + } + if int64(len(bb)) < n { return io.ErrUnexpectedEOF } *b = bb[n:] @@ -81,19 +79,22 @@ type readerWrapper struct { tmp [8]byte } -func (r *readerWrapper) readSmall(n int) []byte { +func (r *readerWrapper) readSmall(n int) ([]byte, error) { if debugAsserts && n > 8 { panic(fmt.Errorf("small read > 8 (%d). use readBig", n)) } n2, err := io.ReadFull(r.r, r.tmp[:n]) // We only really care about the actual bytes read. - if n2 != n { - if debug { + if err != nil { + if err == io.EOF { + return nil, io.ErrUnexpectedEOF + } + if debugDecoder { println("readSmall: got", n2, "want", n, "err", err) } - return nil + return nil, err } - return r.tmp[:n] + return r.tmp[:n], nil } func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) { @@ -108,8 +109,11 @@ func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) { } func (r *readerWrapper) readByte() (byte, error) { - n2, err := r.r.Read(r.tmp[:1]) + n2, err := io.ReadFull(r.r, r.tmp[:1]) if err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } return 0, err } if n2 != 1 { @@ -118,9 +122,9 @@ func (r *readerWrapper) readByte() (byte, error) { return r.tmp[0], nil } -func (r *readerWrapper) skipN(n int) error { - n2, err := io.CopyN(ioutil.Discard, r.r, int64(n)) - if n2 != int64(n) { +func (r *readerWrapper) skipN(n int64) error { + n2, err := io.CopyN(io.Discard, r.r, n) + if n2 != n { err = io.ErrUnexpectedEOF } return err diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go index 2c4fca17fa..0e59a242d8 100644 --- a/vendor/github.com/klauspost/compress/zstd/bytereader.go +++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go @@ -13,12 +13,6 @@ type byteReader struct { off int } -// init will initialize the reader and set the input. -func (b *byteReader) init(in []byte) { - b.b = in - b.off = 0 -} - // advance the stream b n bytes. func (b *byteReader) advance(n uint) { b.off += int(n) diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go index 87896c5eaa..6a5a2988b6 100644 --- a/vendor/github.com/klauspost/compress/zstd/decodeheader.go +++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go @@ -4,7 +4,7 @@ package zstd import ( - "bytes" + "encoding/binary" "errors" "io" ) @@ -15,18 +15,50 @@ const HeaderMaxSize = 14 + 3 // Header contains information about the first frame and block within that. type Header struct { - // Window Size the window of data to keep while decoding. - // Will only be set if HasFCS is false. - WindowSize uint64 + // SingleSegment specifies whether the data is to be decompressed into a + // single contiguous memory segment. 
+ // It implies that WindowSize is invalid and that FrameContentSize is valid. + SingleSegment bool - // Frame content size. - // Expected size of the entire frame. - FrameContentSize uint64 + // WindowSize is the window of data to keep while decoding. + // Will only be set if SingleSegment is false. + WindowSize uint64 // Dictionary ID. // If 0, no dictionary. DictionaryID uint32 + // HasFCS specifies whether FrameContentSize has a valid value. + HasFCS bool + + // FrameContentSize is the expected uncompressed size of the entire frame. + FrameContentSize uint64 + + // Skippable will be true if the frame is meant to be skipped. + // This implies that FirstBlock.OK is false. + Skippable bool + + // SkippableID is the user-specific ID for the skippable frame. + // Valid values are between 0 to 15, inclusive. + SkippableID int + + // SkippableSize is the length of the user data to skip following + // the header. + SkippableSize uint32 + + // HeaderSize is the raw size of the frame header. + // + // For normal frames, it includes the size of the magic number and + // the size of the header (per section 3.1.1.1). + // It does not include the size for any data blocks (section 3.1.1.2) nor + // the size for the trailing content checksum. + // + // For skippable frames, this counts the size of the magic number + // along with the size of the size field of the payload. + // It does not include the size of the skippable payload itself. + // The total frame size is the HeaderSize plus the SkippableSize. + HeaderSize int + // First block information. FirstBlock struct { // OK will be set if first block could be decoded. @@ -51,17 +83,9 @@ type Header struct { CompressedSize int } - // Skippable will be true if the frame is meant to be skipped. - // No other information will be populated. - Skippable bool - // If set there is a checksum present for the block content. + // The checksum field at the end is always 4 bytes long. HasCheckSum bool - - // If this is true FrameContentSize will have a valid value - HasFCS bool - - SingleSegment bool } // Decode the header from the beginning of the stream. @@ -71,39 +95,58 @@ type Header struct { // If there isn't enough input, io.ErrUnexpectedEOF is returned. // The FirstBlock.OK will indicate if enough information was available to decode the first block header. func (h *Header) Decode(in []byte) error { + _, err := h.DecodeAndStrip(in) + return err +} + +// DecodeAndStrip will decode the header from the beginning of the stream +// and on success return the remaining bytes. +// This will decode the frame header and the first block header if enough bytes are provided. +// It is recommended to provide at least HeaderMaxSize bytes. +// If the frame header cannot be read an error will be returned. +// If there isn't enough input, io.ErrUnexpectedEOF is returned. +// The FirstBlock.OK will indicate if enough information was available to decode the first block header. 
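
Review note: a minimal sketch of driving the new DecodeAndStrip entry point. The helper name is hypothetical; it assumes the fmt and github.com/klauspost/compress/zstd imports and an input of at least HeaderMaxSize bytes, per the doc comment above.

    func frameInfo(frameBytes []byte) error {
        var h zstd.Header
        remain, err := h.DecodeAndStrip(frameBytes)
        if err != nil {
            return err // io.ErrUnexpectedEOF when frameBytes is too short
        }
        fmt.Println("window:", h.WindowSize, "has FCS:", h.HasFCS, "dict ID:", h.DictionaryID)
        fmt.Println("header size:", h.HeaderSize, "bytes remaining:", len(remain))
        return nil
    }
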
+func (h *Header) DecodeAndStrip(in []byte) (remain []byte, err error) { + *h = Header{} if len(in) < 4 { - return io.ErrUnexpectedEOF + return nil, io.ErrUnexpectedEOF } + h.HeaderSize += 4 b, in := in[:4], in[4:] - if !bytes.Equal(b, frameMagic) { - if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 { - return ErrMagicMismatch + if string(b) != frameMagic { + if string(b[1:4]) != skippableFrameMagic || b[0]&0xf0 != 0x50 { + return nil, ErrMagicMismatch + } + if len(in) < 4 { + return nil, io.ErrUnexpectedEOF } - *h = Header{Skippable: true} - return nil + h.HeaderSize += 4 + h.Skippable = true + h.SkippableID = int(b[0] & 0xf) + h.SkippableSize = binary.LittleEndian.Uint32(in) + return in[4:], nil } + + // Read Window_Descriptor + // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor if len(in) < 1 { - return io.ErrUnexpectedEOF + return nil, io.ErrUnexpectedEOF } - - // Clear output - *h = Header{} fhd, in := in[0], in[1:] + h.HeaderSize++ h.SingleSegment = fhd&(1<<5) != 0 h.HasCheckSum = fhd&(1<<2) != 0 - if fhd&(1<<3) != 0 { - return errors.New("Reserved bit set on frame header") + return nil, errors.New("reserved bit set on frame header") } - // Read Window_Descriptor - // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor if !h.SingleSegment { if len(in) < 1 { - return io.ErrUnexpectedEOF + return nil, io.ErrUnexpectedEOF } var wd byte wd, in = in[0], in[1:] + h.HeaderSize++ windowLog := 10 + (wd >> 3) windowBase := uint64(1) << windowLog windowAdd := (windowBase / 8) * uint64(wd&0x7) @@ -117,13 +160,11 @@ func (h *Header) Decode(in []byte) error { size = 4 } if len(in) < int(size) { - return io.ErrUnexpectedEOF + return nil, io.ErrUnexpectedEOF } b, in = in[:size], in[size:] - if b == nil { - return io.ErrUnexpectedEOF - } - switch size { + h.HeaderSize += int(size) + switch len(b) { case 1: h.DictionaryID = uint32(b[0]) case 2: @@ -149,13 +190,11 @@ func (h *Header) Decode(in []byte) error { if fcsSize > 0 { h.HasFCS = true if len(in) < fcsSize { - return io.ErrUnexpectedEOF + return nil, io.ErrUnexpectedEOF } b, in = in[:fcsSize], in[fcsSize:] - if b == nil { - return io.ErrUnexpectedEOF - } - switch fcsSize { + h.HeaderSize += int(fcsSize) + switch len(b) { case 1: h.FrameContentSize = uint64(b[0]) case 2: @@ -172,9 +211,9 @@ func (h *Header) Decode(in []byte) error { // Frame Header done, we will not fail from now on. if len(in) < 3 { - return nil + return in, nil } - tmp, in := in[:3], in[3:] + tmp := in[:3] bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16) h.FirstBlock.Last = bh&1 != 0 blockType := blockType((bh >> 1) & 3) @@ -182,7 +221,7 @@ func (h *Header) Decode(in []byte) error { cSize := int(bh >> 3) switch blockType { case blockTypeReserved: - return nil + return in, nil case blockTypeRLE: h.FirstBlock.Compressed = true h.FirstBlock.DecompressedSize = cSize @@ -198,5 +237,25 @@ func (h *Header) Decode(in []byte) error { } h.FirstBlock.OK = true - return nil + return in, nil +} + +// AppendTo will append the encoded header to the dst slice. +// There is no error checking performed on the header values. +func (h *Header) AppendTo(dst []byte) ([]byte, error) { + if h.Skippable { + magic := [4]byte{0x50, 0x2a, 0x4d, 0x18} + magic[0] |= byte(h.SkippableID & 0xf) + dst = append(dst, magic[:]...) 
+ f := h.SkippableSize + return append(dst, uint8(f), uint8(f>>8), uint8(f>>16), uint8(f>>24)), nil + } + f := frameHeader{ + ContentSize: h.FrameContentSize, + WindowSize: uint32(h.WindowSize), + SingleSegment: h.SingleSegment, + Checksum: h.HasCheckSum, + DictID: h.DictionaryID, + } + return f.appendTo(dst), nil } diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go index cdda0de58b..bbca17234a 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder.go @@ -5,10 +5,12 @@ package zstd import ( - "bytes" - "errors" + "context" + "encoding/binary" "io" "sync" + + "github.com/klauspost/compress/zstd/internal/xxhash" ) // Decoder provides decoding of zstandard streams. @@ -23,15 +25,22 @@ type Decoder struct { // Unreferenced decoders, ready for use. decoders chan *blockDec - // Streams ready to be decoded. - stream chan decodeStream - // Current read position used for Reader functionality. current decoderState + // sync stream decoding + syncStream struct { + decodedFrame uint64 + br readerWrapper + enabled bool + inFrame bool + dstBuf []byte + } + + frame *frameDec + // Custom dictionaries. - // Always uses copies. - dicts map[uint32]dict + dicts map[uint32]*dict // streamWg is the waitgroup for all streams streamWg sync.WaitGroup @@ -47,7 +56,10 @@ type decoderState struct { output chan decodeOutput // cancel remaining output. - cancel chan struct{} + cancel context.CancelFunc + + // crc of current frame + crc *xxhash.Digest flushed bool } @@ -70,7 +82,7 @@ var ( // can run multiple concurrent stateless decodes. It is even possible to // use stateless decodes while a stream is being decoded. // -// The Reset function can be used to initiate a new stream, which is will considerably +// The Reset function can be used to initiate a new stream, which will considerably // reduce the allocations normally caused by NewReader. func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) { initPredefined() @@ -82,11 +94,15 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) { return nil, err } } - d.current.output = make(chan decodeOutput, d.o.concurrent) + d.current.crc = xxhash.New() d.current.flushed = true + if r == nil { + d.current.err = ErrDecoderNilInput + } + // Transfer option dicts. - d.dicts = make(map[uint32]dict, len(d.o.dicts)) + d.dicts = make(map[uint32]*dict, len(d.o.dicts)) for _, dc := range d.o.dicts { d.dicts[dc.id] = dc } @@ -110,9 +126,6 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) { // Returns the number of bytes written and any error that occurred. // When the stream is done, io.EOF will be returned. 
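
Review note: the Decoder still satisfies io.Reader, so the conventional streaming pattern keeps working against the rewritten decode path. A sketch, assuming the io and zstd imports:

    func decompress(dst io.Writer, src io.Reader) error {
        d, err := zstd.NewReader(src)
        if err != nil {
            return err
        }
        defer d.Close()
        _, err = io.Copy(dst, d) // Read/WriteTo drive the new decode path
        return err
    }
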
func (d *Decoder) Read(p []byte) (int, error) { - if d.stream == nil { - return 0, errors.New("no input has been initialized") - } var n int for { if len(d.current.b) > 0 { @@ -130,12 +143,12 @@ func (d *Decoder) Read(p []byte) (int, error) { break } if !d.nextBlock(n == 0) { - return n, nil + return n, d.current.err } } } if len(d.current.b) > 0 { - if debug { + if debugDecoder { println("returning", n, "still bytes left:", len(d.current.b)) } // Only return error at end of block @@ -144,7 +157,7 @@ func (d *Decoder) Read(p []byte) (int, error) { if d.current.err != nil { d.drainOutput() } - if debug { + if debugDecoder { println("returning", n, d.current.err, len(d.decoders)) } return n, d.current.err @@ -152,70 +165,90 @@ func (d *Decoder) Read(p []byte) (int, error) { // Reset will reset the decoder the supplied stream after the current has finished processing. // Note that this functionality cannot be used after Close has been called. +// Reset can be called with a nil reader to release references to the previous reader. +// After being called with a nil reader, no other operations than Reset or DecodeAll or Close +// should be used. func (d *Decoder) Reset(r io.Reader) error { if d.current.err == ErrDecoderClosed { return d.current.err } - if r == nil { - return errors.New("nil Reader sent as input") - } - - if d.stream == nil { - d.stream = make(chan decodeStream, 1) - d.streamWg.Add(1) - go d.startStreamDecoder(d.stream) - } d.drainOutput() - // If bytes buffer and < 1MB, do sync decoding anyway. - if bb, ok := r.(*bytes.Buffer); ok && bb.Len() < 1<<20 { - if debug { + d.syncStream.br.r = nil + if r == nil { + d.current.err = ErrDecoderNilInput + if len(d.current.b) > 0 { + d.current.b = d.current.b[:0] + } + d.current.flushed = true + return nil + } + + // If bytes buffer and < 5MB, do sync decoding anyway. + if bb, ok := r.(byter); ok && bb.Len() < d.o.decodeBufsBelow && !d.o.limitToCap { + bb2 := bb + if debugDecoder { println("*bytes.Buffer detected, doing sync decode, len:", bb.Len()) } - b := bb.Bytes() + b := bb2.Bytes() var dst []byte - if cap(d.current.b) > 0 { - dst = d.current.b + if cap(d.syncStream.dstBuf) > 0 { + dst = d.syncStream.dstBuf[:0] } - dst, err := d.DecodeAll(b, dst[:0]) + dst, err := d.DecodeAll(b, dst) if err == nil { err = io.EOF } + // Save output buffer + d.syncStream.dstBuf = dst d.current.b = dst d.current.err = err d.current.flushed = true - if debug { + if debugDecoder { println("sync decode to", len(dst), "bytes, err:", err) } return nil } - // Remove current block. + d.stashDecoder() d.current.decodeOutput = decodeOutput{} d.current.err = nil - d.current.cancel = make(chan struct{}) d.current.flushed = false d.current.d = nil + d.syncStream.dstBuf = nil + + // Ensure no-one else is still running... + d.streamWg.Wait() + if d.frame == nil { + d.frame = newFrameDec(d.o) + } - d.stream <- decodeStream{ - r: r, - output: d.current.output, - cancel: d.current.cancel, + if d.o.concurrent == 1 { + return d.startSyncDecoder(r) } + + d.current.output = make(chan decodeOutput, d.o.concurrent) + ctx, cancel := context.WithCancel(context.Background()) + d.current.cancel = cancel + d.streamWg.Add(1) + go d.startStreamDecoder(ctx, r, d.current.output) + return nil } // drainOutput will drain the output until errEndOfStream is sent. 
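
Review note: Reset now accepts a nil reader to release references, which enables an allocate-once pattern across many streams. (The unchanged comment above still mentions errEndOfStream, even though this diff deletes that sentinel further down.) A sketch with a hypothetical helper:

    func decompressMany(dst io.Writer, inputs []io.Reader) error {
        d, err := zstd.NewReader(nil) // no input yet; Reset supplies each stream
        if err != nil {
            return err
        }
        defer d.Close()
        for _, r := range inputs {
            if err := d.Reset(r); err != nil {
                return err
            }
            if _, err := io.Copy(dst, d); err != nil {
                return err
            }
        }
        return d.Reset(nil) // drop the reference to the last reader
    }
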
func (d *Decoder) drainOutput() { if d.current.cancel != nil { - println("cancelling current") - close(d.current.cancel) + if debugDecoder { + println("cancelling current") + } + d.current.cancel() d.current.cancel = nil } if d.current.d != nil { - if debug { + if debugDecoder { printf("re-adding current decoder %p, decoders: %d", d.current.d, len(d.decoders)) } d.decoders <- d.current.d @@ -226,39 +259,31 @@ func (d *Decoder) drainOutput() { println("current already flushed") return } - for { - select { - case v := <-d.current.output: - if v.d != nil { - if debug { - printf("re-adding decoder %p", v.d) - } - d.decoders <- v.d - } - if v.err == errEndOfStream { - println("current flushed") - d.current.flushed = true - return + for v := range d.current.output { + if v.d != nil { + if debugDecoder { + printf("re-adding decoder %p", v.d) } + d.decoders <- v.d } } + d.current.output = nil + d.current.flushed = true } // WriteTo writes data to w until there's no more data to write or when an error occurs. // The return value n is the number of bytes written. // Any error encountered during the write is also returned. func (d *Decoder) WriteTo(w io.Writer) (int64, error) { - if d.stream == nil { - return 0, errors.New("no input has been initialized") - } var n int64 for { if len(d.current.b) > 0 { n2, err2 := w.Write(d.current.b) n += int64(n2) - if err2 != nil && d.current.err == nil { + if err2 != nil && (d.current.err == nil || d.current.err == io.EOF) { d.current.err = err2 - break + } else if n2 != len(d.current.b) { + d.current.err = io.ErrShortWrite } } if d.current.err != nil { @@ -282,19 +307,23 @@ func (d *Decoder) WriteTo(w io.Writer) (int64, error) { // DecodeAll can be used concurrently. // The Decoder concurrency limits will be respected. func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { - if d.current.err == ErrDecoderClosed { + if d.decoders == nil { return dst, ErrDecoderClosed } // Grab a block decoder and frame decoder. block := <-d.decoders frame := block.localFrame + initialSize := len(dst) defer func() { - if debug { + if debugDecoder { printf("re-adding decoder: %p", block) } frame.rawInput = nil frame.bBuf = nil + if frame.history.decoders.br != nil { + frame.history.decoders.br.in = nil + } d.decoders <- block }() frame.bBuf = input @@ -302,34 +331,45 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { for { frame.history.reset() err := frame.reset(&frame.bBuf) - if err == io.EOF { - if debug { - println("frame reset return EOF") - } - return dst, nil - } - if frame.DictionaryID != nil { - dict, ok := d.dicts[*frame.DictionaryID] - if !ok { - return nil, ErrUnknownDictionary - } - frame.history.setDict(&dict) - } if err != nil { + if err == io.EOF { + if debugDecoder { + println("frame reset return EOF") + } + return dst, nil + } return dst, err } - if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) { - return dst, ErrDecoderSizeExceeded + if err = d.setDict(frame); err != nil { + return nil, err } - if frame.FrameContentSize > 0 && frame.FrameContentSize < 1<<30 { - // Never preallocate moe than 1 GB up front. 
+ if frame.WindowSize > d.o.maxWindowSize { + if debugDecoder { + println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize) + } + return dst, ErrWindowSizeExceeded + } + if frame.FrameContentSize != fcsUnknown { + if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)-initialSize) { + if debugDecoder { + println("decoder size exceeded; fcs:", frame.FrameContentSize, "> mcs:", d.o.maxDecodedSize-uint64(len(dst)-initialSize), "len:", len(dst)) + } + return dst, ErrDecoderSizeExceeded + } + if d.o.limitToCap && frame.FrameContentSize > uint64(cap(dst)-len(dst)) { + if debugDecoder { + println("decoder size exceeded; fcs:", frame.FrameContentSize, "> (cap-len)", cap(dst)-len(dst)) + } + return dst, ErrDecoderSizeExceeded + } if cap(dst)-len(dst) < int(frame.FrameContentSize) { - dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)) + dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc) copy(dst2, dst) dst = dst2 } } - if cap(dst) == 0 { + + if cap(dst) == 0 && !d.o.limitToCap { // Allocate len(input) * 2 by default if nothing is provided // and we didn't get frame content size. size := len(input) * 2 @@ -347,8 +387,11 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { if err != nil { return dst, err } + if uint64(len(dst)-initialSize) > d.o.maxDecodedSize { + return dst, ErrDecoderSizeExceeded + } if len(frame.bBuf) == 0 { - if debug { + if debugDecoder { println("frame dbuf empty") } break @@ -363,33 +406,167 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { // If non-blocking mode is used the returned boolean will be false // if no data was available without blocking. func (d *Decoder) nextBlock(blocking bool) (ok bool) { - if d.current.d != nil { - if debug { - printf("re-adding current decoder %p", d.current.d) - } - d.decoders <- d.current.d - d.current.d = nil - } if d.current.err != nil { // Keep error state. - return blocking + return false } + d.current.b = d.current.b[:0] + // SYNC: + if d.syncStream.enabled { + if !blocking { + return false + } + ok = d.nextBlockSync() + if !ok { + d.stashDecoder() + } + return ok + } + + //ASYNC: + d.stashDecoder() if blocking { - d.current.decodeOutput = <-d.current.output + d.current.decodeOutput, ok = <-d.current.output } else { select { - case d.current.decodeOutput = <-d.current.output: + case d.current.decodeOutput, ok = <-d.current.output: default: return false } } - if debug { - println("got", len(d.current.b), "bytes, error:", d.current.err) + if !ok { + // This should not happen, so signal error state... 
+ d.current.err = io.ErrUnexpectedEOF + return false + } + next := d.current.decodeOutput + if next.d != nil && next.d.async.newHist != nil { + d.current.crc.Reset() + } + if debugDecoder { + var tmp [4]byte + binary.LittleEndian.PutUint32(tmp[:], uint32(xxhash.Sum64(next.b))) + println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp) + } + + if d.o.ignoreChecksum { + return true + } + + if len(next.b) > 0 { + d.current.crc.Write(next.b) + } + if next.err == nil && next.d != nil && next.d.hasCRC { + got := uint32(d.current.crc.Sum64()) + if got != next.d.checkCRC { + if debugDecoder { + printf("CRC Check Failed: %08x (got) != %08x (on stream)\n", got, next.d.checkCRC) + } + d.current.err = ErrCRCMismatch + } else { + if debugDecoder { + printf("CRC ok %08x\n", got) + } + } + } + + return true +} + +func (d *Decoder) nextBlockSync() (ok bool) { + if d.current.d == nil { + d.current.d = <-d.decoders + } + for len(d.current.b) == 0 { + if !d.syncStream.inFrame { + d.frame.history.reset() + d.current.err = d.frame.reset(&d.syncStream.br) + if d.current.err == nil { + d.current.err = d.setDict(d.frame) + } + if d.current.err != nil { + return false + } + if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize { + d.current.err = ErrDecoderSizeExceeded + return false + } + + d.syncStream.decodedFrame = 0 + d.syncStream.inFrame = true + } + d.current.err = d.frame.next(d.current.d) + if d.current.err != nil { + return false + } + d.frame.history.ensureBlock() + if debugDecoder { + println("History trimmed:", len(d.frame.history.b), "decoded already:", d.syncStream.decodedFrame) + } + histBefore := len(d.frame.history.b) + d.current.err = d.current.d.decodeBuf(&d.frame.history) + + if d.current.err != nil { + println("error after:", d.current.err) + return false + } + d.current.b = d.frame.history.b[histBefore:] + if debugDecoder { + println("history after:", len(d.frame.history.b)) + } + + // Check frame size (before CRC) + d.syncStream.decodedFrame += uint64(len(d.current.b)) + if d.syncStream.decodedFrame > d.frame.FrameContentSize { + if debugDecoder { + printf("DecodedFrame (%d) > FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize) + } + d.current.err = ErrFrameSizeExceeded + return false + } + + // Check FCS + if d.current.d.Last && d.frame.FrameContentSize != fcsUnknown && d.syncStream.decodedFrame != d.frame.FrameContentSize { + if debugDecoder { + printf("DecodedFrame (%d) != FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize) + } + d.current.err = ErrFrameSizeMismatch + return false + } + + // Update/Check CRC + if d.frame.HasCheckSum { + if !d.o.ignoreChecksum { + d.frame.crc.Write(d.current.b) + } + if d.current.d.Last { + if !d.o.ignoreChecksum { + d.current.err = d.frame.checkCRC() + } else { + d.current.err = d.frame.consumeCRC() + } + if d.current.err != nil { + println("CRC error:", d.current.err) + return false + } + } + } + d.syncStream.inFrame = !d.current.d.Last } return true } +func (d *Decoder) stashDecoder() { + if d.current.d != nil { + if debugDecoder { + printf("re-adding current decoder %p", d.current.d) + } + d.decoders <- d.current.d + d.current.d = nil + } +} + // Close will release all resources. // It is NOT possible to reuse the decoder after this. 
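
Review note: DecodeAll stays the buffer-to-buffer path and, per its doc comment, may be called concurrently; after Close it reports ErrDecoderClosed via the new d.decoders == nil check. A sketch with a hypothetical helper:

    func decodeBuffers(compressed [][]byte) ([][]byte, error) {
        d, err := zstd.NewReader(nil, zstd.WithDecoderConcurrency(0)) // 0 selects GOMAXPROCS
        if err != nil {
            return nil, err
        }
        defer d.Close() // afterwards both Read and DecodeAll fail with ErrDecoderClosed
        out := make([][]byte, 0, len(compressed))
        for _, b := range compressed {
            dec, err := d.DecodeAll(b, nil) // safe to call from multiple goroutines
            if err != nil {
                return nil, err
            }
            out = append(out, dec)
        }
        return out, nil
    }
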
func (d *Decoder) Close() { @@ -397,10 +574,10 @@ func (d *Decoder) Close() { return } d.drainOutput() - if d.stream != nil { - close(d.stream) + if d.current.cancel != nil { + d.current.cancel() d.streamWg.Wait() - d.stream = nil + d.current.cancel = nil } if d.decoders != nil { close(d.decoders) @@ -451,100 +628,321 @@ type decodeOutput struct { err error } -type decodeStream struct { - r io.Reader - - // Blocks ready to be written to output. - output chan decodeOutput - - // cancel reading from the input - cancel chan struct{} +func (d *Decoder) startSyncDecoder(r io.Reader) error { + d.frame.history.reset() + d.syncStream.br = readerWrapper{r: r} + d.syncStream.inFrame = false + d.syncStream.enabled = true + d.syncStream.decodedFrame = 0 + return nil } -// errEndOfStream indicates that everything from the stream was read. -var errEndOfStream = errors.New("end-of-stream") - // Create Decoder: -// Spawn n block decoders. These accept tasks to decode a block. -// Create goroutine that handles stream processing, this will send history to decoders as they are available. -// Decoders update the history as they decode. -// When a block is returned: -// a) history is sent to the next decoder, -// b) content written to CRC. -// c) return data to WRITER. -// d) wait for next block to return data. -// Once WRITTEN, the decoders reused by the writer frame decoder for re-use. -func (d *Decoder) startStreamDecoder(inStream chan decodeStream) { +// ASYNC: +// Spawn 3 go routines. +// 0: Read frames and decode block literals. +// 1: Decode sequences. +// 2: Execute sequences, send to output. +func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) { defer d.streamWg.Done() - frame := newFrameDec(d.o) - for stream := range inStream { - if debug { - println("got new stream") + br := readerWrapper{r: r} + + var seqDecode = make(chan *blockDec, d.o.concurrent) + var seqExecute = make(chan *blockDec, d.o.concurrent) + + // Async 1: Decode sequences... 
+ go func() { + var hist history + var hasErr bool + + for block := range seqDecode { + if hasErr { + if block != nil { + seqExecute <- block + } + continue + } + if block.async.newHist != nil { + if debugDecoder { + println("Async 1: new history, recent:", block.async.newHist.recentOffsets) + } + hist.reset() + hist.decoders = block.async.newHist.decoders + hist.recentOffsets = block.async.newHist.recentOffsets + hist.windowSize = block.async.newHist.windowSize + if block.async.newHist.dict != nil { + hist.setDict(block.async.newHist.dict) + } + } + if block.err != nil || block.Type != blockTypeCompressed { + hasErr = block.err != nil + seqExecute <- block + continue + } + + hist.decoders.literals = block.async.literals + block.err = block.prepareSequences(block.async.seqData, &hist) + if debugDecoder && block.err != nil { + println("prepareSequences returned:", block.err) + } + hasErr = block.err != nil + if block.err == nil { + block.err = block.decodeSequences(&hist) + if debugDecoder && block.err != nil { + println("decodeSequences returned:", block.err) + } + hasErr = block.err != nil + // block.async.sequence = hist.decoders.seq[:hist.decoders.nSeqs] + block.async.seqSize = hist.decoders.seqSize + } + seqExecute <- block } - br := readerWrapper{r: stream.r} - decodeStream: - for { - frame.history.reset() - err := frame.reset(&br) - if debug && err != nil { - println("Frame decoder returned", err) - } - if err == nil && frame.DictionaryID != nil { - dict, ok := d.dicts[*frame.DictionaryID] - if !ok { - err = ErrUnknownDictionary + close(seqExecute) + hist.reset() + }() + + var wg sync.WaitGroup + wg.Add(1) + + // Async 3: Execute sequences... + frameHistCache := d.frame.history.b + go func() { + var hist history + var decodedFrame uint64 + var fcs uint64 + var hasErr bool + for block := range seqExecute { + out := decodeOutput{err: block.err, d: block} + if block.err != nil || hasErr { + hasErr = true + output <- out + continue + } + if block.async.newHist != nil { + if debugDecoder { + println("Async 2: new history") + } + hist.reset() + hist.windowSize = block.async.newHist.windowSize + hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer + if block.async.newHist.dict != nil { + hist.setDict(block.async.newHist.dict) + } + + if cap(hist.b) < hist.allocFrameBuffer { + if cap(frameHistCache) >= hist.allocFrameBuffer { + hist.b = frameHistCache + } else { + hist.b = make([]byte, 0, hist.allocFrameBuffer) + println("Alloc history sized", hist.allocFrameBuffer) + } + } + hist.b = hist.b[:0] + fcs = block.async.fcs + decodedFrame = 0 + } + do := decodeOutput{err: block.err, d: block} + switch block.Type { + case blockTypeRLE: + if debugDecoder { + println("add rle block length:", block.RLESize) + } + + if cap(block.dst) < int(block.RLESize) { + if block.lowMem { + block.dst = make([]byte, block.RLESize) + } else { + block.dst = make([]byte, maxCompressedBlockSize) + } + } + block.dst = block.dst[:block.RLESize] + v := block.data[0] + for i := range block.dst { + block.dst[i] = v + } + hist.append(block.dst) + do.b = block.dst + case blockTypeRaw: + if debugDecoder { + println("add raw block length:", len(block.data)) + } + hist.append(block.data) + do.b = block.data + case blockTypeCompressed: + if debugDecoder { + println("execute with history length:", len(hist.b), "window:", hist.windowSize) + } + hist.decoders.seqSize = block.async.seqSize + hist.decoders.literals = block.async.literals + do.err = block.executeSequences(&hist) + hasErr = do.err != nil + if debugDecoder && 
hasErr { + println("executeSequences returned:", do.err) + } + do.b = block.dst + } + if !hasErr { + decodedFrame += uint64(len(do.b)) + if decodedFrame > fcs { + println("fcs exceeded", block.Last, fcs, decodedFrame) + do.err = ErrFrameSizeExceeded + hasErr = true + } else if block.Last && fcs != fcsUnknown && decodedFrame != fcs { + do.err = ErrFrameSizeMismatch + hasErr = true } else { - frame.history.setDict(&dict) + if debugDecoder { + println("fcs ok", block.Last, fcs, decodedFrame) + } } } - if err != nil { - stream.output <- decodeOutput{ - err: err, + output <- do + } + close(output) + frameHistCache = hist.b + wg.Done() + if debugDecoder { + println("decoder goroutines finished") + } + hist.reset() + }() + + var hist history +decodeStream: + for { + var hasErr bool + hist.reset() + decodeBlock := func(block *blockDec) { + if hasErr { + if block != nil { + seqDecode <- block } - break + return + } + if block.err != nil || block.Type != blockTypeCompressed { + hasErr = block.err != nil + seqDecode <- block + return + } + + remain, err := block.decodeLiterals(block.data, &hist) + block.err = err + hasErr = block.err != nil + if err == nil { + block.async.literals = hist.decoders.literals + block.async.seqData = remain + } else if debugDecoder { + println("decodeLiterals error:", err) + } + seqDecode <- block + } + frame := d.frame + if debugDecoder { + println("New frame...") + } + var historySent bool + frame.history.reset() + err := frame.reset(&br) + if debugDecoder && err != nil { + println("Frame decoder returned", err) + } + if err == nil { + err = d.setDict(frame) + } + if err == nil && d.frame.WindowSize > d.o.maxWindowSize { + if debugDecoder { + println("decoder size exceeded, fws:", d.frame.WindowSize, "> mws:", d.o.maxWindowSize) } - if debug { - println("starting frame decoder") + + err = ErrDecoderSizeExceeded + } + if err != nil { + select { + case <-ctx.Done(): + case dec := <-d.decoders: + dec.sendErr(err) + decodeBlock(dec) } + break decodeStream + } - // This goroutine will forward history between frames. - frame.frameDone.Add(1) - frame.initAsync() + // Go through all blocks of the frame. + for { + var dec *blockDec + select { + case <-ctx.Done(): + break decodeStream + case dec = <-d.decoders: + // Once we have a decoder, we MUST return it. + } + err := frame.next(dec) + if !historySent { + h := frame.history + if debugDecoder { + println("Alloc History:", h.allocFrameBuffer) + } + hist.reset() + if h.dict != nil { + hist.setDict(h.dict) + } + dec.async.newHist = &h + dec.async.fcs = frame.FrameContentSize + historySent = true + } else { + dec.async.newHist = nil + } + if debugDecoder && err != nil { + println("next block returned error:", err) + } + dec.err = err + dec.hasCRC = false + if dec.Last && frame.HasCheckSum && err == nil { + crc, err := frame.rawInput.readSmall(4) + if len(crc) < 4 { + if err == nil { + err = io.ErrUnexpectedEOF - go frame.startDecoder(stream.output) - decodeFrame: - // Go through all blocks of the frame. - for { - dec := <-d.decoders - select { - case <-stream.cancel: - if !frame.sendErr(dec, io.EOF) { - // To not let the decoder dangle, send it back. 
- stream.output <- decodeOutput{d: dec} } - break decodeStream - default: - } - err := frame.next(dec) - switch err { - case io.EOF: - // End of current frame, no error - println("EOF on next block") - break decodeFrame - case nil: - continue - default: - println("block decoder returned", err) - break decodeStream + println("CRC missing?", err) + dec.err = err + } else { + dec.checkCRC = binary.LittleEndian.Uint32(crc) + dec.hasCRC = true + if debugDecoder { + printf("found crc to check: %08x\n", dec.checkCRC) + } } } - // All blocks have started decoding, check if there are more frames. - println("waiting for done") - frame.frameDone.Wait() - println("done waiting...") + err = dec.err + last := dec.Last + decodeBlock(dec) + if err != nil { + break decodeStream + } + if last { + break + } } - frame.frameDone.Wait() - println("Sending EOS") - stream.output <- decodeOutput{err: errEndOfStream} } + close(seqDecode) + wg.Wait() + hist.reset() + d.frame.history.b = frameHistCache +} + +func (d *Decoder) setDict(frame *frameDec) (err error) { + dict, ok := d.dicts[frame.DictionaryID] + if ok { + if debugDecoder { + println("setting dict", frame.DictionaryID) + } + frame.history.setDict(dict) + } else if frame.DictionaryID != 0 { + // A zero or missing dictionary id is ambiguous: + // either dictionary zero, or no dictionary. In particular, + // zstd --patch-from uses this id for the source file, + // so only return an error if the dictionary id is not zero. + err = ErrUnknownDictionary + } + return err } diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go index 284d384492..774c5f00fe 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go @@ -7,6 +7,7 @@ package zstd import ( "errors" "fmt" + "math/bits" "runtime" ) @@ -15,19 +16,28 @@ type DOption func(*decoderOptions) error // options retains accumulated state of multiple options. type decoderOptions struct { - lowMem bool - concurrent int - maxDecodedSize uint64 - dicts []dict + lowMem bool + concurrent int + maxDecodedSize uint64 + maxWindowSize uint64 + dicts []*dict + ignoreChecksum bool + limitToCap bool + decodeBufsBelow int } func (o *decoderOptions) setDefault() { *o = decoderOptions{ // use less ram: true for now, but may change. - lowMem: true, - concurrent: runtime.GOMAXPROCS(0), + lowMem: true, + concurrent: runtime.GOMAXPROCS(0), + maxWindowSize: MaxWindowSize, + decodeBufsBelow: 128 << 10, } - o.maxDecodedSize = 1 << 63 + if o.concurrent > 4 { + o.concurrent = 4 + } + o.maxDecodedSize = 64 << 30 } // WithDecoderLowmem will set whether to use a lower amount of memory, @@ -36,16 +46,25 @@ func WithDecoderLowmem(b bool) DOption { return func(o *decoderOptions) error { o.lowMem = b; return nil } } -// WithDecoderConcurrency will set the concurrency, -// meaning the maximum number of decoders to run concurrently. -// The value supplied must be at least 1. -// By default this will be set to GOMAXPROCS. +// WithDecoderConcurrency sets the number of created decoders. +// When decoding block with DecodeAll, this will limit the number +// of possible concurrently running decodes. +// When decoding streams, this will limit the number of +// inflight blocks. +// When decoding streams and setting maximum to 1, +// no async decoding will be done. +// When a value of 0 is provided GOMAXPROCS will be used. 
+// By default this will be set to 4 or GOMAXPROCS, whatever is lower. func WithDecoderConcurrency(n int) DOption { return func(o *decoderOptions) error { - if n <= 0 { - return fmt.Errorf("Concurrency must be at least 1") + if n < 0 { + return errors.New("concurrency must be at least 1") + } + if n == 0 { + o.concurrent = runtime.GOMAXPROCS(0) + } else { + o.concurrent = n } - o.concurrent = n return nil } } @@ -53,15 +72,14 @@ func WithDecoderConcurrency(n int) DOption { // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory // non-streaming operations or maximum window size for streaming operations. // This can be used to control memory usage of potentially hostile content. -// For streaming operations, the maximum window size is capped at 1<<30 bytes. -// Maximum and default is 1 << 63 bytes. +// Maximum is 1 << 63 bytes. Default is 64GiB. func WithDecoderMaxMemory(n uint64) DOption { return func(o *decoderOptions) error { if n == 0 { return errors.New("WithDecoderMaxMemory must be at least 1") } if n > 1<<63 { - return fmt.Errorf("WithDecoderMaxmemory must be less than 1 << 63") + return errors.New("WithDecoderMaxmemory must be less than 1 << 63") } o.maxDecodedSize = n return nil @@ -69,7 +87,13 @@ func WithDecoderMaxMemory(n uint64) DOption { } // WithDecoderDicts allows to register one or more dictionaries for the decoder. -// If several dictionaries with the same ID is provided the last one will be used. +// +// Each slice in dict must be in the [dictionary format] produced by +// "zstd --train" from the Zstandard reference implementation. +// +// If several dictionaries with the same ID are provided, the last one will be used. +// +// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format func WithDecoderDicts(dicts ...[]byte) DOption { return func(o *decoderOptions) error { for _, b := range dicts { @@ -77,8 +101,69 @@ func WithDecoderDicts(dicts ...[]byte) DOption { if err != nil { return err } - o.dicts = append(o.dicts, *d) + o.dicts = append(o.dicts, d) + } + return nil + } +} + +// WithDecoderDictRaw registers a dictionary that may be used by the decoder. +// The slice content can be arbitrary data. +func WithDecoderDictRaw(id uint32, content []byte) DOption { + return func(o *decoderOptions) error { + if bits.UintSize > 32 && uint(len(content)) > dictMaxLength { + return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content)) + } + o.dicts = append(o.dicts, &dict{id: id, content: content, offsets: [3]int{1, 4, 8}}) + return nil + } +} + +// WithDecoderMaxWindow allows to set a maximum window size for decodes. +// This allows rejecting packets that will cause big memory usage. +// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting. +// If WithDecoderMaxMemory is set to a lower value, that will be used. +// Default is 512MB, Maximum is ~3.75 TB as per zstandard spec. +func WithDecoderMaxWindow(size uint64) DOption { + return func(o *decoderOptions) error { + if size < MinWindowSize { + return errors.New("WithMaxWindowSize must be at least 1KB, 1024 bytes") + } + if size > (1<<41)+7*(1<<38) { + return errors.New("WithMaxWindowSize must be less than (1<<41) + 7*(1<<38) ~ 3.75TB") } + o.maxWindowSize = size + return nil + } +} + +// WithDecodeAllCapLimit will limit DecodeAll to decoding cap(dst)-len(dst) bytes, +// or any size set in WithDecoderMaxMemory. +// This can be used to limit decoding to a specific maximum output size. 
+// Disabled by default. +func WithDecodeAllCapLimit(b bool) DOption { + return func(o *decoderOptions) error { + o.limitToCap = b + return nil + } +} + +// WithDecodeBuffersBelow will fully decode readers that have a +// `Bytes() []byte` and `Len() int` interface similar to bytes.Buffer. +// This typically uses less allocations but will have the full decompressed object in memory. +// Note that DecodeAllCapLimit will disable this, as well as giving a size of 0 or less. +// Default is 128KiB. +func WithDecodeBuffersBelow(size int) DOption { + return func(o *decoderOptions) error { + o.decodeBufsBelow = size + return nil + } +} + +// IgnoreChecksum allows to forcibly ignore checksum checking. +func IgnoreChecksum(b bool) DOption { + return func(o *decoderOptions) error { + o.ignoreChecksum = b return nil } } diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go index fa25a18d86..b7b83164bc 100644 --- a/vendor/github.com/klauspost/compress/zstd/dict.go +++ b/vendor/github.com/klauspost/compress/zstd/dict.go @@ -6,6 +6,8 @@ import ( "errors" "fmt" "io" + "math" + "sort" "github.com/klauspost/compress/huff0" ) @@ -15,12 +17,14 @@ type dict struct { litEnc *huff0.Scratch llDec, ofDec, mlDec sequenceDec - //llEnc, ofEnc, mlEnc []*fseEncoder - offsets [3]int - content []byte + offsets [3]int + content []byte } -var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec} +const dictMagic = "\x37\xa4\x30\xec" + +// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB. +const dictMaxLength = 1 << 31 // ID returns the dictionary id or 0 if d is nil. func (d *dict) ID() uint32 { @@ -30,14 +34,38 @@ func (d *dict) ID() uint32 { return d.id } -// DictContentSize returns the dictionary content size or 0 if d is nil. -func (d *dict) DictContentSize() int { +// ContentSize returns the dictionary content size or 0 if d is nil. +func (d *dict) ContentSize() int { if d == nil { return 0 } return len(d.content) } +// Content returns the dictionary content. +func (d *dict) Content() []byte { + if d == nil { + return nil + } + return d.content +} + +// Offsets returns the initial offsets. +func (d *dict) Offsets() [3]int { + if d == nil { + return [3]int{} + } + return d.offsets +} + +// LitEncoder returns the literal encoder. +func (d *dict) LitEncoder() *huff0.Scratch { + if d == nil { + return nil + } + return d.litEnc +} + // Load a dictionary as described in // https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format func loadDict(b []byte) (*dict, error) { @@ -50,7 +78,7 @@ func loadDict(b []byte) (*dict, error) { ofDec: sequenceDec{fse: &fseDecoder{}}, mlDec: sequenceDec{fse: &fseDecoder{}}, } - if !bytes.Equal(b[:4], dictMagic[:]) { + if string(b[:4]) != dictMagic { return nil, ErrMagicMismatch } d.id = binary.LittleEndian.Uint32(b[4:8]) @@ -62,7 +90,7 @@ func loadDict(b []byte) (*dict, error) { var err error d.litEnc, b, err = huff0.ReadTable(b[8:], nil) if err != nil { - return nil, err + return nil, fmt.Errorf("loading literal table: %w", err) } d.litEnc.Reuse = huff0.ReusePolicyMust @@ -82,7 +110,7 @@ func loadDict(b []byte) (*dict, error) { println("Transform table error:", err) return err } - if debug { + if debugDecoder || debugEncoder { println("Read table ok", "symbolLen:", dec.symbolLen) } // Set decoders as predefined so they aren't reused. 
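
Review note: the new and reworked decoder limits compose. A sketch of a hardened decoder for untrusted input, using only options defined in this file; the specific values are illustrative:

    func hardenedReader(src io.Reader) (*zstd.Decoder, error) {
        return zstd.NewReader(src,
            zstd.WithDecoderMaxMemory(1<<30),  // cap total decoded output at 1 GiB
            zstd.WithDecoderMaxWindow(64<<20), // reject frames demanding >64 MiB windows
            zstd.WithDecoderConcurrency(1),    // force the synchronous stream path
        )
    }
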
@@ -120,3 +148,418 @@ func loadDict(b []byte) (*dict, error) { return &d, nil } + +// InspectDictionary loads a zstd dictionary and provides functions to inspect the content. +func InspectDictionary(b []byte) (interface { + ID() uint32 + ContentSize() int + Content() []byte + Offsets() [3]int + LitEncoder() *huff0.Scratch +}, error) { + initPredefined() + d, err := loadDict(b) + return d, err +} + +type BuildDictOptions struct { + // Dictionary ID. + ID uint32 + + // Content to use to create dictionary tables. + Contents [][]byte + + // History to use for all blocks. + History []byte + + // Offsets to use. + Offsets [3]int + + // CompatV155 will make the dictionary compatible with Zstd v1.5.5 and earlier. + // See https://github.com/facebook/zstd/issues/3724 + CompatV155 bool + + // Use the specified encoder level. + // The dictionary will be built using the specified encoder level, + // which will reflect speed and make the dictionary tailored for that level. + // If not set SpeedBestCompression will be used. + Level EncoderLevel + + // DebugOut will write stats and other details here if set. + DebugOut io.Writer +} + +func BuildDict(o BuildDictOptions) ([]byte, error) { + initPredefined() + hist := o.History + contents := o.Contents + debug := o.DebugOut != nil + println := func(args ...interface{}) { + if o.DebugOut != nil { + fmt.Fprintln(o.DebugOut, args...) + } + } + printf := func(s string, args ...interface{}) { + if o.DebugOut != nil { + fmt.Fprintf(o.DebugOut, s, args...) + } + } + print := func(args ...interface{}) { + if o.DebugOut != nil { + fmt.Fprint(o.DebugOut, args...) + } + } + + if int64(len(hist)) > dictMaxLength { + return nil, fmt.Errorf("dictionary of size %d > %d", len(hist), int64(dictMaxLength)) + } + if len(hist) < 8 { + return nil, fmt.Errorf("dictionary of size %d < %d", len(hist), 8) + } + if len(contents) == 0 { + return nil, errors.New("no content provided") + } + d := dict{ + id: o.ID, + litEnc: nil, + llDec: sequenceDec{}, + ofDec: sequenceDec{}, + mlDec: sequenceDec{}, + offsets: o.Offsets, + content: hist, + } + block := blockEnc{lowMem: false} + block.init() + enc := encoder(&bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(maxMatchLen), bufferReset: math.MaxInt32 - int32(maxMatchLen*2), lowMem: false}}) + if o.Level != 0 { + eOpts := encoderOptions{ + level: o.Level, + blockSize: maxMatchLen, + windowSize: maxMatchLen, + dict: &d, + lowMem: false, + } + enc = eOpts.encoder() + } else { + o.Level = SpeedBestCompression + } + var ( + remain [256]int + ll [256]int + ml [256]int + of [256]int + ) + addValues := func(dst *[256]int, src []byte) { + for _, v := range src { + dst[v]++ + } + } + addHist := func(dst *[256]int, src *[256]uint32) { + for i, v := range src { + dst[i] += int(v) + } + } + seqs := 0 + nUsed := 0 + litTotal := 0 + newOffsets := make(map[uint32]int, 1000) + for _, b := range contents { + block.reset(nil) + if len(b) < 8 { + continue + } + nUsed++ + enc.Reset(&d, true) + enc.Encode(&block, b) + addValues(&remain, block.literals) + litTotal += len(block.literals) + if len(block.sequences) == 0 { + continue + } + seqs += len(block.sequences) + block.genCodes() + addHist(&ll, block.coders.llEnc.Histogram()) + addHist(&ml, block.coders.mlEnc.Histogram()) + addHist(&of, block.coders.ofEnc.Histogram()) + for i, seq := range block.sequences { + if i > 3 { + break + } + offset := seq.offset + if offset == 0 { + continue + } + if int(offset) >= len(o.History) { + continue + } + if offset > 3 { + newOffsets[offset-3]++ + } else { + 
newOffsets[uint32(o.Offsets[offset-1])]++ + } + } + } + // Find most used offsets. + var sortedOffsets []uint32 + for k := range newOffsets { + sortedOffsets = append(sortedOffsets, k) + } + sort.Slice(sortedOffsets, func(i, j int) bool { + a, b := sortedOffsets[i], sortedOffsets[j] + if a == b { + // Prefer the longer offset + return sortedOffsets[i] > sortedOffsets[j] + } + return newOffsets[sortedOffsets[i]] > newOffsets[sortedOffsets[j]] + }) + if len(sortedOffsets) > 3 { + if debug { + print("Offsets:") + for i, v := range sortedOffsets { + if i > 20 { + break + } + printf("[%d: %d],", v, newOffsets[v]) + } + println("") + } + + sortedOffsets = sortedOffsets[:3] + } + for i, v := range sortedOffsets { + o.Offsets[i] = int(v) + } + if debug { + println("New repeat offsets", o.Offsets) + } + + if nUsed == 0 || seqs == 0 { + return nil, fmt.Errorf("%d blocks, %d sequences found", nUsed, seqs) + } + if debug { + println("Sequences:", seqs, "Blocks:", nUsed, "Literals:", litTotal) + } + if seqs/nUsed < 512 { + // Use 512 as minimum. + nUsed = seqs / 512 + if nUsed == 0 { + nUsed = 1 + } + } + copyHist := func(dst *fseEncoder, src *[256]int) ([]byte, error) { + hist := dst.Histogram() + var maxSym uint8 + var maxCount int + var fakeLength int + for i, v := range src { + if v > 0 { + v = v / nUsed + if v == 0 { + v = 1 + } + } + if v > maxCount { + maxCount = v + } + if v != 0 { + maxSym = uint8(i) + } + fakeLength += v + hist[i] = uint32(v) + } + + // Ensure we aren't trying to represent RLE. + if maxCount == fakeLength { + for i := range hist { + if uint8(i) == maxSym { + fakeLength++ + maxSym++ + hist[i+1] = 1 + if maxSym > 1 { + break + } + } + if hist[0] == 0 { + fakeLength++ + hist[i] = 1 + if maxSym > 1 { + break + } + } + } + } + + dst.HistogramFinished(maxSym, maxCount) + dst.reUsed = false + dst.useRLE = false + err := dst.normalizeCount(fakeLength) + if err != nil { + return nil, err + } + if debug { + println("RAW:", dst.count[:maxSym+1], "NORM:", dst.norm[:maxSym+1], "LEN:", fakeLength) + } + return dst.writeCount(nil) + } + if debug { + print("Literal lengths: ") + } + llTable, err := copyHist(block.coders.llEnc, &ll) + if err != nil { + return nil, err + } + if debug { + print("Match lengths: ") + } + mlTable, err := copyHist(block.coders.mlEnc, &ml) + if err != nil { + return nil, err + } + if debug { + print("Offsets: ") + } + ofTable, err := copyHist(block.coders.ofEnc, &of) + if err != nil { + return nil, err + } + + // Literal table + avgSize := litTotal + if avgSize > huff0.BlockSizeMax/2 { + avgSize = huff0.BlockSizeMax / 2 + } + huffBuff := make([]byte, 0, avgSize) + // Target size + div := litTotal / avgSize + if div < 1 { + div = 1 + } + if debug { + println("Huffman weights:") + } + for i, n := range remain[:] { + if n > 0 { + n = n / div + // Allow all entries to be represented. + if n == 0 { + n = 1 + } + huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...) + if debug { + printf("[%d: %d], ", i, n) + } + } + } + if o.CompatV155 && remain[255]/div == 0 { + huffBuff = append(huffBuff, 255) + } + scratch := &huff0.Scratch{TableLog: 11} + for tries := 0; tries < 255; tries++ { + scratch = &huff0.Scratch{TableLog: 11} + _, _, err = huff0.Compress1X(huffBuff, scratch) + if err == nil { + break + } + if debug { + printf("Try %d: Huffman error: %v\n", tries+1, err) + } + huffBuff = huffBuff[:0] + if tries == 250 { + if debug { + println("Huffman: Bailing out with predefined table") + } + + // Bail out.... 
Just generate something + huffBuff = append(huffBuff, bytes.Repeat([]byte{255}, 10000)...) + for i := 0; i < 128; i++ { + huffBuff = append(huffBuff, byte(i)) + } + continue + } + if errors.Is(err, huff0.ErrIncompressible) { + // Try truncating least common. + for i, n := range remain[:] { + if n > 0 { + n = n / (div * (i + 1)) + if n > 0 { + huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...) + } + } + } + if o.CompatV155 && len(huffBuff) > 0 && huffBuff[len(huffBuff)-1] != 255 { + huffBuff = append(huffBuff, 255) + } + if len(huffBuff) == 0 { + huffBuff = append(huffBuff, 0, 255) + } + } + if errors.Is(err, huff0.ErrUseRLE) { + for i, n := range remain[:] { + n = n / (div * (i + 1)) + // Allow all entries to be represented. + if n == 0 { + n = 1 + } + huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...) + } + } + } + + var out bytes.Buffer + out.Write([]byte(dictMagic)) + out.Write(binary.LittleEndian.AppendUint32(nil, o.ID)) + out.Write(scratch.OutTable) + if debug { + println("huff table:", len(scratch.OutTable), "bytes") + println("of table:", len(ofTable), "bytes") + println("ml table:", len(mlTable), "bytes") + println("ll table:", len(llTable), "bytes") + } + out.Write(ofTable) + out.Write(mlTable) + out.Write(llTable) + out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[0]))) + out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[1]))) + out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[2]))) + out.Write(hist) + if debug { + _, err := loadDict(out.Bytes()) + if err != nil { + panic(err) + } + i, err := InspectDictionary(out.Bytes()) + if err != nil { + panic(err) + } + println("ID:", i.ID()) + println("Content size:", i.ContentSize()) + println("Encoder:", i.LitEncoder() != nil) + println("Offsets:", i.Offsets()) + var totalSize int + for _, b := range contents { + totalSize += len(b) + } + + encWith := func(opts ...EOption) int { + enc, err := NewWriter(nil, opts...) + if err != nil { + panic(err) + } + defer enc.Close() + var dst []byte + var totalSize int + for _, b := range contents { + dst = enc.EncodeAll(b, dst[:0]) + totalSize += len(dst) + } + return totalSize + } + plain := encWith(WithEncoderLevel(o.Level)) + withDict := encWith(WithEncoderLevel(o.Level), WithEncoderDict(out.Bytes())) + println("Input size:", totalSize) + println("Plain Compressed:", plain) + println("Dict Compressed:", withDict) + println("Saved:", plain-withDict, (plain-withDict)/len(contents), "bytes per input (rounded down)") + } + return out.Bytes(), nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go index b1b7c6e6a7..5ca46038ad 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_base.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go @@ -7,16 +7,22 @@ import ( "github.com/klauspost/compress/zstd/internal/xxhash" ) +const ( + dictShardBits = 6 +) + type fastBase struct { // cur is the offset at the start of hist cur int32 // maximum offset. Should be at least 2x block size. maxMatchOff int32 + bufferReset int32 hist []byte crc *xxhash.Digest tmp [8]byte blk *blockEnc lastDictID uint32 + lowMem bool } // CRC returns the underlying CRC writer. @@ -33,8 +39,8 @@ func (e *fastBase) AppendCRC(dst []byte) []byte { // WindowSize returns the window size of the encoder, // or a window size small enough to contain the input size, if > 0. 
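
Review note: on the BuildDict API added above, a sketch under the assumption that samples holds representative payloads; the ID, the conventional starting offsets {1, 4, 8}, and the use of the first sample as history are all illustrative choices:

    func buildDict(samples [][]byte) ([]byte, error) {
        return zstd.BuildDict(zstd.BuildDictOptions{
            ID:       1,               // illustrative, non-zero dictionary ID
            Contents: samples,         // blocks used to gather statistics
            History:  samples[0],      // assumed history block (must be >= 8 bytes)
            Offsets:  [3]int{1, 4, 8}, // conventional starting repeat offsets
            Level:    zstd.SpeedBestCompression,
        })
    }
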
-func (e *fastBase) WindowSize(size int) int32 { - if size > 0 && size < int(e.maxMatchOff) { +func (e *fastBase) WindowSize(size int64) int32 { + if size > 0 && size < int64(e.maxMatchOff) { b := int32(1) << uint(bits.Len(uint(size))) // Keep minimum window. if b < 1024 { @@ -51,21 +57,16 @@ func (e *fastBase) Block() *blockEnc { } func (e *fastBase) addBlock(src []byte) int32 { - if debugAsserts && e.cur > bufferReset { - panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset)) + if debugAsserts && e.cur > e.bufferReset { + panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, e.bufferReset)) } // check if we have space already if len(e.hist)+len(src) > cap(e.hist) { if cap(e.hist) == 0 { - l := e.maxMatchOff * 2 - // Make it at least 1MB. - if l < 1<<20 { - l = 1 << 20 - } - e.hist = make([]byte, 0, l) + e.ensureHist(len(src)) } else { - if cap(e.hist) < int(e.maxMatchOff*2) { - panic("unexpected buffer size") + if cap(e.hist) < int(e.maxMatchOff+maxCompressedBlockSize) { + panic(fmt.Errorf("unexpected buffer cap %d, want at least %d with window %d", cap(e.hist), e.maxMatchOff+maxCompressedBlockSize, e.maxMatchOff)) } // Move down offset := int32(len(e.hist)) - e.maxMatchOff @@ -79,6 +80,28 @@ func (e *fastBase) addBlock(src []byte) int32 { return s } +// ensureHist will ensure that history can keep at least this many bytes. +func (e *fastBase) ensureHist(n int) { + if cap(e.hist) >= n { + return + } + l := e.maxMatchOff + if (e.lowMem && e.maxMatchOff > maxCompressedBlockSize) || e.maxMatchOff <= maxCompressedBlockSize { + l += maxCompressedBlockSize + } else { + l += e.maxMatchOff + } + // Make it at least 1MB. + if l < 1<<20 && !e.lowMem { + l = 1 << 20 + } + // Make it at least the requested size. + if l < int32(n) { + l = int32(n) + } + e.hist = make([]byte, 0, l) +} + // useBlock will replace the block with the provided one, // but transfer recent offsets from the previous. func (e *fastBase) UseBlock(enc *blockEnc) { @@ -86,11 +109,6 @@ func (e *fastBase) UseBlock(enc *blockEnc) { e.blk = enc } -func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 { - // Extend the match to be as long as possible. - return int32(matchLen(src[s:], src[t:])) -} - func (e *fastBase) matchlen(s, t int32, src []byte) int32 { if debugAsserts { if s < 0 { @@ -109,15 +127,13 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 { panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize)) } } - - // Extend the match to be as long as possible. return int32(matchLen(src[s:], src[t:])) } // Reset the encoding table. func (e *fastBase) resetBase(d *dict, singleBlock bool) { if e.blk == nil { - e.blk = &blockEnc{} + e.blk = &blockEnc{lowMem: e.lowMem} e.blk.init() } else { e.blk.reset(nil) @@ -128,17 +144,19 @@ func (e *fastBase) resetBase(d *dict, singleBlock bool) { } else { e.crc.Reset() } - if (!singleBlock || d.DictContentSize() > 0) && cap(e.hist) < int(e.maxMatchOff*2)+d.DictContentSize() { - l := e.maxMatchOff*2 + int32(d.DictContentSize()) - // Make it at least 1MB. - if l < 1<<20 { - l = 1 << 20 + e.blk.dictLitEnc = nil + if d != nil { + low := e.lowMem + if singleBlock { + e.lowMem = true } - e.hist = make([]byte, 0, l) + e.ensureHist(d.ContentSize() + maxCompressedBlockSize) + e.lowMem = low } + // We offset current position so everything will be out of reach. // If above reset line, history will be purged. 
- if e.cur < bufferReset { + if e.cur < e.bufferReset { e.cur += e.maxMatchOff + int32(len(e.hist)) } e.hist = e.hist[:0] diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go index c4baa42c64..4613724e9d 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_best.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go @@ -5,22 +5,61 @@ package zstd import ( + "bytes" "fmt" - "math/bits" + + "github.com/klauspost/compress" ) const ( - bestLongTableBits = 20 // Bits used in the long match table + bestLongTableBits = 22 // Bits used in the long match table bestLongTableSize = 1 << bestLongTableBits // Size of the table + bestLongLen = 8 // Bytes used for table hash // Note: Increasing the short table bits or making the hash shorter // can actually lead to compression degradation since it will 'steal' more from the // long match table and match offsets are quite big. // This greatly depends on the type of input. - bestShortTableBits = 16 // Bits used in the short match table + bestShortTableBits = 18 // Bits used in the short match table bestShortTableSize = 1 << bestShortTableBits // Size of the table + bestShortLen = 4 // Bytes used for table hash + ) +type match struct { + offset int32 + s int32 + length int32 + rep int32 + est int32 +} + +const highScore = maxMatchLen * 8 + +// estBits will estimate output bits from predefined tables. +func (m *match) estBits(bitsPerByte int32) { + mlc := mlCode(uint32(m.length - zstdMinMatch)) + var ofc uint8 + if m.rep < 0 { + ofc = ofCode(uint32(m.s-m.offset) + 3) + } else { + ofc = ofCode(uint32(m.rep) & 3) + } + // Cost, excluding + ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc] + + // Add cost of match encoding... + m.est = int32(ofTT.outBits + mlTT.outBits) + m.est += int32(ofTT.deltaNbBits>>16 + mlTT.deltaNbBits>>16) + // Subtract savings compared to literal encoding... + m.est -= (m.length * bitsPerByte) >> 10 + if m.est > 0 { + // Unlikely gain.. + m.length = 0 + m.est = highScore + } +} + // bestFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches. // The long match table contains the previous entry with the same hash, // effectively making it a "chain" of length 2. @@ -45,14 +84,10 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { - for i := range e.table[:] { - e.table[i] = prevEntry{} - } - for i := range e.longTable[:] { - e.longTable[i] = prevEntry{} - } + e.table = [bestShortTableSize]prevEntry{} + e.longTable = [bestLongTableSize]prevEntry{} e.cur = e.maxMatchOff break } @@ -100,8 +135,20 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) { break } + // Add block to history s := e.addBlock(src) blk.size = len(src) + + // Check RLE first + if len(src) > zstdMinMatch { + ml := matchLen(src[1:], src) + if ml == len(src)-1 { + blk.literals = append(blk.literals, src[0]) + blk.sequences = append(blk.sequences, seq{litLen: 1, matchLen: uint32(len(src)-1) - zstdMinMatch, offset: 1 + 3}) + return + } + } + if len(src) < minNonLiteralBlockSize { blk.extraLits = len(src) blk.literals = blk.literals[:len(src)] @@ -109,14 +156,21 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) { return } + // Use this to estimate literal cost. + // Scaled by 10 bits. 
+ bitsPerByte := int32((compress.ShannonEntropyBits(src) * 1024) / len(src)) + // Huffman can never go < 1 bit/byte + if bitsPerByte < 1024 { + bitsPerByte = 1024 + } + // Override src src = e.hist sLimit := int32(len(src)) - inputMargin - const kSearchStrength = 12 + const kSearchStrength = 10 // nextEmit is where in src the next emitLiteral should start from. nextEmit := s - cv := load6432(src, s) // Relative offsets offset1 := int32(blk.recentOffsets[0]) @@ -130,9 +184,8 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) { blk.literals = append(blk.literals, src[nextEmit:until]...) s.litLen = uint32(until - nextEmit) } - _ = addLiterals - if debug { + if debugEncoder { println("recent offsets:", blk.recentOffsets) } @@ -145,54 +198,104 @@ encodeLoop: panic("offset0 was 0") } - type match struct { - offset int32 - s int32 - length int32 - rep int32 - } - matchAt := func(offset int32, s int32, first uint32, rep int32) match { - if s-offset >= e.maxMatchOff || load3232(src, offset) != first { - return match{offset: offset, s: s} - } - return match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep} - } + const goodEnough = 250 + + cv := load6432(src, s) + + nextHashL := hashLen(cv, bestLongTableBits, bestLongLen) + nextHashS := hashLen(cv, bestShortTableBits, bestShortLen) + candidateL := e.longTable[nextHashL] + candidateS := e.table[nextHashS] - bestOf := func(a, b match) match { - aScore := b.s - a.s + a.length - bScore := a.s - b.s + b.length - if a.rep < 0 { - aScore = aScore - int32(bits.Len32(uint32(a.offset)))/8 + // Set m to a match at offset if it looks like that will improve compression. + improve := func(m *match, offset int32, s int32, first uint32, rep int32) { + delta := s - offset + if delta >= e.maxMatchOff || delta <= 0 || load3232(src, offset) != first { + return } - if b.rep < 0 { - bScore = bScore - int32(bits.Len32(uint32(b.offset)))/8 + // Try to quick reject if we already have a long match. + if m.length > 16 { + left := len(src) - int(m.s+m.length) + // If we are too close to the end, keep as is. + if left <= 0 { + return + } + checkLen := m.length - (s - m.s) - 8 + if left > 2 && checkLen > 4 { + // Check 4 bytes, 4 bytes from the end of the current match. + a := load3232(src, offset+checkLen) + b := load3232(src, s+checkLen) + if a != b { + return + } + } + } + l := 4 + e.matchlen(s+4, offset+4, src) + if m.rep <= 0 { + // Extend candidate match backwards as far as possible. + // Do not extend repeats as we can assume they are optimal + // and offsets change if s == nextEmit. 
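// The loop below is the classic backward extension step: once a match is
// found at (s, offset), it may also cover bytes just before s that were
// about to be emitted as literals. A stand-alone version of that loop,
// bounded by the window start (tMin), the last emitted literal (nextEmit)
// and the format's maximum match length:

func extendBackwards(src []byte, s, offset, l, tMin, nextEmit, maxLen int32) (int32, int32, int32) {
	for offset > tMin && s > nextEmit && src[offset-1] == src[s-1] && l < maxLen {
		s--
		offset--
		l++
	}
	return s, offset, l
}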
+ tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for offset > tMin && s > nextEmit && src[offset-1] == src[s-1] && l < maxMatchLength { + s-- + offset-- + l++ + } + } + if debugAsserts { + if offset >= s { + panic(fmt.Sprintf("offset: %d - s:%d - rep: %d - cur :%d - max: %d", offset, s, rep, e.cur, e.maxMatchOff)) + } + if !bytes.Equal(src[s:s+l], src[offset:offset+l]) { + panic(fmt.Sprintf("second match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first)) + } } - if aScore >= bScore { - return a + cand := match{offset: offset, s: s, length: l, rep: rep} + cand.estBits(bitsPerByte) + if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 { + *m = cand } - return b } - const goodEnough = 100 - nextHashL := hash8(cv, bestLongTableBits) - nextHashS := hash4x64(cv, bestShortTableBits) - candidateL := e.longTable[nextHashL] - candidateS := e.table[nextHashS] + best := match{s: s, est: highScore} + improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1) + improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1) + improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1) + improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1) - best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)) - best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)) - best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)) if canRepeat && best.length < goodEnough { - best = bestOf(best, matchAt(s-offset1+1, s+1, uint32(cv>>8), 1)) - best = bestOf(best, matchAt(s-offset2+1, s+1, uint32(cv>>8), 2)) - best = bestOf(best, matchAt(s-offset3+1, s+1, uint32(cv>>8), 3)) - best = bestOf(best, matchAt(s-offset1+3, s+3, uint32(cv>>24), 1)) - best = bestOf(best, matchAt(s-offset2+3, s+3, uint32(cv>>24), 2)) - best = bestOf(best, matchAt(s-offset3+3, s+3, uint32(cv>>24), 3)) + if s == nextEmit { + // Check repeats straight after a match. + improve(&best, s-offset2, s, uint32(cv), 1|4) + improve(&best, s-offset3, s, uint32(cv), 2|4) + if offset1 > 1 { + improve(&best, s-(offset1-1), s, uint32(cv), 3|4) + } + } + + // If either no match or a non-repeat match, check at + 1 + if best.rep <= 0 { + cv32 := uint32(cv >> 8) + spp := s + 1 + improve(&best, spp-offset1, spp, cv32, 1) + improve(&best, spp-offset2, spp, cv32, 2) + improve(&best, spp-offset3, spp, cv32, 3) + if best.rep < 0 { + cv32 = uint32(cv >> 24) + spp += 2 + improve(&best, spp-offset1, spp, cv32, 1) + improve(&best, spp-offset2, spp, cv32, 2) + improve(&best, spp-offset3, spp, cv32, 3) + } + } } // Load next and check... e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset} e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: candidateS.offset} + index0 := s + 1 // Look far ahead, unless we have a really long match already... 
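// The longTable stores above implement a hash chain of depth two: each
// bucket keeps the most recent position for its hash plus the one it
// displaced, so every probe yields two candidates. A minimal sketch, with
// illustrative names:

type chainEntry struct {
	offset int32 // most recent absolute position for this hash
	prev   int32 // the position it displaced
}

// insertChained records pos in bucket h and returns the old entry, whose
// offset and prev fields the caller can still probe as candidates.
func insertChained(table []chainEntry, h uint32, pos int32) chainEntry {
	old := table[h]
	table[h] = chainEntry{offset: pos, prev: old.offset}
	return old
}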
if best.length < goodEnough { @@ -202,92 +305,117 @@ encodeLoop: if s >= sLimit { break encodeLoop } - cv = load6432(src, s) continue } - s++ - candidateS = e.table[hash4x64(cv>>8, bestShortTableBits)] - cv = load6432(src, s) - cv2 := load6432(src, s+1) - candidateL = e.longTable[hash8(cv, bestLongTableBits)] - candidateL2 := e.longTable[hash8(cv2, bestLongTableBits)] - - best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)) - best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)) - best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)) - best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)) - best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)) + candidateS = e.table[hashLen(cv>>8, bestShortTableBits, bestShortLen)] + cv = load6432(src, s+1) + cv2 := load6432(src, s+2) + candidateL = e.longTable[hashLen(cv, bestLongTableBits, bestLongLen)] + candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)] + + // Short at s+1 + improve(&best, candidateS.offset-e.cur, s+1, uint32(cv), -1) + // Long at s+1, s+2 + improve(&best, candidateL.offset-e.cur, s+1, uint32(cv), -1) + improve(&best, candidateL.prev-e.cur, s+1, uint32(cv), -1) + improve(&best, candidateL2.offset-e.cur, s+2, uint32(cv2), -1) + improve(&best, candidateL2.prev-e.cur, s+2, uint32(cv2), -1) + if false { + // Short at s+3. + // Too often worse... + improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+3, uint32(cv2>>8), -1) + } + + // Start check at a fixed offset to allow for a few mismatches. + // For this compression level 2 yields the best results. + // We cannot do this if we have already indexed this position. + const skipBeginning = 2 + if best.s > s-skipBeginning { + // See if we can find a better match by checking where the current best ends. + // Use that offset to see if we can find a better full match. + if sAt := best.s + best.length; sAt < sLimit { + nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen) + candidateEnd := e.longTable[nextHashL] + + if off := candidateEnd.offset - e.cur - best.length + skipBeginning; off >= 0 { + improve(&best, off, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1) + if off := candidateEnd.prev - e.cur - best.length + skipBeginning; off >= 0 { + improve(&best, off, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1) + } + } + } + } + } + + if debugAsserts { + if best.offset >= best.s { + panic(fmt.Sprintf("best.offset > s: %d >= %d", best.offset, best.s)) + } + if best.s < nextEmit { + panic(fmt.Sprintf("s %d < nextEmit %d", best.s, nextEmit)) + } + if best.offset < s-e.maxMatchOff { + panic(fmt.Sprintf("best.offset < s-e.maxMatchOff: %d < %d", best.offset, s-e.maxMatchOff)) + } + if !bytes.Equal(src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]) { + panic(fmt.Sprintf("match mismatch: %v != %v", src[best.s:best.s+best.length], src[best.offset:best.offset+best.length])) + } } // We have a match, we can store the forward value + s = best.s if best.rep > 0 { - s = best.s var seq seq seq.matchLen = uint32(best.length - zstdMinMatch) + addLiterals(&seq, best.s) - // We might be able to match backwards. - // Extend as long as we can. - start := best.s - // We end the search early, so we don't risk 0 literals - // and have to do special offset treatment. 
- startLimit := nextEmit + 1 - - tMin := s - e.maxMatchOff - if tMin < 0 { - tMin = 0 - } - repIndex := best.offset - for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 { - repIndex-- - start-- - seq.matchLen++ - } - addLiterals(&seq, start) - - // rep 0 - seq.offset = uint32(best.rep) + // Repeat. If bit 4 is set, this is a non-lit repeat. + seq.offset = uint32(best.rep & 3) if debugSequences { - println("repeat sequence", seq, "next s:", s) + println("repeat sequence", seq, "next s:", best.s, "off:", best.s-best.offset) } blk.sequences = append(blk.sequences, seq) - // Index match start+1 (long) -> s - 1 - index0 := s + // Index old s + 1 -> s - 1 s = best.s + best.length - nextEmit = s - if s >= sLimit { - if debug { - println("repeat ended", s, best.length) - } - break encodeLoop - } // Index skipped... + end := s + if s > sLimit+4 { + end = sLimit + 4 + } off := index0 + e.cur - for index0 < s-1 { + for index0 < end { cv0 := load6432(src, index0) - h0 := hash8(cv0, bestLongTableBits) - h1 := hash4x64(cv0, bestShortTableBits) + h0 := hashLen(cv0, bestLongTableBits, bestLongLen) + h1 := hashLen(cv0, bestShortTableBits, bestShortLen) e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset} off++ index0++ } + switch best.rep { - case 2: + case 2, 4 | 1: offset1, offset2 = offset2, offset1 - case 3: + case 3, 4 | 2: offset1, offset2, offset3 = offset3, offset1, offset2 + case 4 | 3: + offset1, offset2, offset3 = offset1-1, offset1, offset2 + } + if s >= sLimit { + if debugEncoder { + println("repeat ended", s, best.length) + } + break encodeLoop } - cv = load6432(src, s) continue } // A 4-byte match has been found. Update recent offsets. // We'll later see if more than 4 bytes. - s = best.s t := best.offset offset1, offset2, offset3 = s-t, offset1, offset2 @@ -295,26 +423,13 @@ encodeLoop: panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) } - if debugAsserts && canRepeat && int(offset1) > len(src) { + if debugAsserts && int(offset1) > len(src) { panic("invalid offset") } - // Extend the n-byte match as long as possible. - l := best.length - - // Extend backwards - tMin := s - e.maxMatchOff - if tMin < 0 { - tMin = 0 - } - for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength { - s-- - t-- - l++ - } - // Write our sequence var seq seq + l := best.length seq.litLen = uint32(s - nextEmit) seq.matchLen = uint32(l - zstdMinMatch) if seq.litLen > 0 { @@ -327,65 +442,25 @@ encodeLoop: } blk.sequences = append(blk.sequences, seq) nextEmit = s - if s >= sLimit { - break encodeLoop + + // Index old s + 1 -> s - 1 or sLimit + end := s + if s > sLimit-4 { + end = sLimit - 4 } - // Index match start+1 (long) -> s - 1 - index0 := s - l + 1 - // every entry - for index0 < s-1 { + off := index0 + e.cur + for index0 < end { cv0 := load6432(src, index0) - h0 := hash8(cv0, bestLongTableBits) - h1 := hash4x64(cv0, bestShortTableBits) - off := index0 + e.cur + h0 := hashLen(cv0, bestLongTableBits, bestLongLen) + h1 := hashLen(cv0, bestShortTableBits, bestShortLen) e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset} index0++ + off++ } - - cv = load6432(src, s) - if !canRepeat { - continue - } - - // Check offset 2 - for { - o2 := s - offset2 - if load3232(src, o2) != uint32(cv) { - // Do regular search - break - } - - // Store this, since we have it. 
- nextHashS := hash4x64(cv, bestShortTableBits) - nextHashL := hash8(cv, bestLongTableBits) - - // We have at least 4 byte match. - // No need to check backwards. We come straight from a match - l := 4 + e.matchlen(s+4, o2+4, src) - - e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset} - e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: e.table[nextHashS].offset} - seq.matchLen = uint32(l) - zstdMinMatch - seq.litLen = 0 - - // Since litlen is always 0, this is offset 1. - seq.offset = 1 - s += l - nextEmit = s - if debugSequences { - println("sequence", seq, "next s:", s) - } - blk.sequences = append(blk.sequences, seq) - - // Swap offset 1 and 2. - offset1, offset2 = offset2, offset1 - if s >= sLimit { - // Finished - break encodeLoop - } - cv = load6432(src, s) + if s >= sLimit { + break encodeLoop } } @@ -396,7 +471,7 @@ encodeLoop: blk.recentOffsets[0] = uint32(offset1) blk.recentOffsets[1] = uint32(offset2) blk.recentOffsets[2] = uint32(offset3) - if debug { + if debugEncoder { println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) } } @@ -405,10 +480,11 @@ encodeLoop: // Most notable difference is that src will not be copied for history and // we do not need to check for max match length. func (e *bestFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { + e.ensureHist(len(src)) e.Encode(blk, src) } -// ResetDict will reset and set a dictionary if not nil +// Reset will reset and set a dictionary if not nil func (e *bestFastEncoder) Reset(d *dict, singleBlock bool) { e.resetBase(d, singleBlock) if d == nil { @@ -424,10 +500,10 @@ func (e *bestFastEncoder) Reset(d *dict, singleBlock bool) { const hashLog = bestShortTableBits cv := load6432(d.content, i-e.maxMatchOff) - nextHash := hash4x64(cv, hashLog) // 0 -> 4 - nextHash1 := hash4x64(cv>>8, hashLog) // 1 -> 5 - nextHash2 := hash4x64(cv>>16, hashLog) // 2 -> 6 - nextHash3 := hash4x64(cv>>24, hashLog) // 3 -> 7 + nextHash := hashLen(cv, hashLog, bestShortLen) // 0 -> 4 + nextHash1 := hashLen(cv>>8, hashLog, bestShortLen) // 1 -> 5 + nextHash2 := hashLen(cv>>16, hashLog, bestShortLen) // 2 -> 6 + nextHash3 := hashLen(cv>>24, hashLog, bestShortLen) // 3 -> 7 e.dictTable[nextHash] = prevEntry{ prev: e.dictTable[nextHash].offset, offset: i, @@ -455,7 +531,7 @@ func (e *bestFastEncoder) Reset(d *dict, singleBlock bool) { } if len(d.content) >= 8 { cv := load6432(d.content, 0) - h := hash8(cv, bestLongTableBits) + h := hashLen(cv, bestLongTableBits, bestLongLen) e.dictLongTable[h] = prevEntry{ offset: e.maxMatchOff, prev: e.dictLongTable[h].offset, @@ -465,7 +541,7 @@ func (e *bestFastEncoder) Reset(d *dict, singleBlock bool) { off := 8 // First to read for i := e.maxMatchOff + 1; i < end; i++ { cv = cv>>8 | (uint64(d.content[off]) << 56) - h := hash8(cv, bestLongTableBits) + h := hashLen(cv, bestLongTableBits, bestLongLen) e.dictLongTable[h] = prevEntry{ offset: i, prev: e.dictLongTable[h].offset, diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go index 94a5343d00..84a79fde76 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_better.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go @@ -9,6 +9,7 @@ import "fmt" const ( betterLongTableBits = 19 // Bits used in the long match table betterLongTableSize = 1 << betterLongTableBits // Size of the table + betterLongLen = 8 // Bytes used for table hash // Note: Increasing the short 
table bits or making the hash shorter // can actually lead to compression degradation since it will 'steal' more from the @@ -16,29 +17,575 @@ const ( // This greatly depends on the type of input. betterShortTableBits = 13 // Bits used in the short match table betterShortTableSize = 1 << betterShortTableBits // Size of the table + betterShortLen = 5 // Bytes used for table hash + + betterLongTableShardCnt = 1 << (betterLongTableBits - dictShardBits) // Number of shards in the table + betterLongTableShardSize = betterLongTableSize / betterLongTableShardCnt // Size of an individual shard + + betterShortTableShardCnt = 1 << (betterShortTableBits - dictShardBits) // Number of shards in the table + betterShortTableShardSize = betterShortTableSize / betterShortTableShardCnt // Size of an individual shard ) -type prevEntry struct { - offset int32 - prev int32 +type prevEntry struct { + offset int32 + prev int32 +} + +// betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches. +// The long match table contains the previous entry with the same hash, +// effectively making it a "chain" of length 2. +// When we find a long match we choose between the two values and select the longest. +// When we find a short match, after checking the long, we check if we can find a long at n+1 +// and that it is longer (lazy matching). +type betterFastEncoder struct { + fastBase + table [betterShortTableSize]tableEntry + longTable [betterLongTableSize]prevEntry +} + +type betterFastEncoderDict struct { + betterFastEncoder + dictTable []tableEntry + dictLongTable []prevEntry + shortTableShardDirty [betterShortTableShardCnt]bool + longTableShardDirty [betterLongTableShardCnt]bool + allDirty bool +} + +// Encode improves compression... +func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) { + const ( + // Input margin is the number of bytes we read (8) + // and the maximum we will read ahead (2) + inputMargin = 8 + 2 + minNonLiteralBlockSize = 16 + ) + + // Protect against e.cur wraparound. + for e.cur >= e.bufferReset-int32(len(e.hist)) { + if len(e.hist) == 0 { + e.table = [betterShortTableSize]tableEntry{} + e.longTable = [betterLongTableSize]prevEntry{} + e.cur = e.maxMatchOff + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff + for i := range e.table[:] { + v := e.table[i].offset + if v < minOff { + v = 0 + } else { + v = v - e.cur + e.maxMatchOff + } + e.table[i].offset = v + } + for i := range e.longTable[:] { + v := e.longTable[i].offset + v2 := e.longTable[i].prev + if v < minOff { + v = 0 + v2 = 0 + } else { + v = v - e.cur + e.maxMatchOff + if v2 < minOff { + v2 = 0 + } else { + v2 = v2 - e.cur + e.maxMatchOff + } + } + e.longTable[i] = prevEntry{ + offset: v, + prev: v2, + } + } + e.cur = e.maxMatchOff + break + } + // Add block to history + s := e.addBlock(src) + blk.size = len(src) + + // Check RLE first + if len(src) > zstdMinMatch { + ml := matchLen(src[1:], src) + if ml == len(src)-1 { + blk.literals = append(blk.literals, src[0]) + blk.sequences = append(blk.sequences, seq{litLen: 1, matchLen: uint32(len(src)-1) - zstdMinMatch, offset: 1 + 3}) + return + } + } + + if len(src) < minNonLiteralBlockSize { + blk.extraLits = len(src) + blk.literals = blk.literals[:len(src)] + copy(blk.literals, src) + return + } + + // Override src + src = e.hist + sLimit := int32(len(src)) - inputMargin + // stepSize is the number of bytes to skip on every main loop iteration. 
+ // It should be >= 1. + const stepSize = 1 + + const kSearchStrength = 9 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := s + cv := load6432(src, s) + + // Relative offsets + offset1 := int32(blk.recentOffsets[0]) + offset2 := int32(blk.recentOffsets[1]) + + addLiterals := func(s *seq, until int32) { + if until == nextEmit { + return + } + blk.literals = append(blk.literals, src[nextEmit:until]...) + s.litLen = uint32(until - nextEmit) + } + if debugEncoder { + println("recent offsets:", blk.recentOffsets) + } + +encodeLoop: + for { + var t int32 + // We allow the encoder to optionally turn off repeat offsets across blocks + canRepeat := len(blk.sequences) > 2 + var matched, index0 int32 + + for { + if debugAsserts && canRepeat && offset1 == 0 { + panic("offset0 was 0") + } + + nextHashL := hashLen(cv, betterLongTableBits, betterLongLen) + nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) + candidateL := e.longTable[nextHashL] + candidateS := e.table[nextHashS] + + const repOff = 1 + repIndex := s - offset1 + repOff + off := s + e.cur + e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset} + e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)} + index0 = s + 1 + + if canRepeat { + if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) { + // Consider history as well. + var seq seq + length := 4 + e.matchlen(s+4+repOff, repIndex+4, src) + + seq.matchLen = uint32(length - zstdMinMatch) + + // We might be able to match backwards. + // Extend as long as we can. + start := s + repOff + // We end the search early, so we don't risk 0 literals + // and have to do special offset treatment. + startLimit := nextEmit + 1 + + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 { + repIndex-- + start-- + seq.matchLen++ + } + addLiterals(&seq, start) + + // rep 0 + seq.offset = 1 + if debugSequences { + println("repeat sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + // Index match start+1 (long) -> s - 1 + index0 := s + repOff + s += length + repOff + + nextEmit = s + if s >= sLimit { + if debugEncoder { + println("repeat ended", s, length) + + } + break encodeLoop + } + // Index skipped... + for index0 < s-1 { + cv0 := load6432(src, index0) + cv1 := cv0 >> 8 + h0 := hashLen(cv0, betterLongTableBits, betterLongLen) + off := index0 + e.cur + e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} + e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)} + index0 += 2 + } + cv = load6432(src, s) + continue + } + const repOff2 = 1 + + // We deviate from the reference encoder and also check offset 2. + // Still slower and not much better, so disabled. + // repIndex = s - offset2 + repOff2 + if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) { + // Consider history as well. + var seq seq + length := 8 + e.matchlen(s+8+repOff2, repIndex+8, src) + + seq.matchLen = uint32(length - zstdMinMatch) + + // We might be able to match backwards. + // Extend as long as we can. + start := s + repOff2 + // We end the search early, so we don't risk 0 literals + // and have to do special offset treatment. 
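// The branch below is the repeat-offset path: zstd keeps the most recent
// match offsets, and a sequence can name one by index instead of coding a
// full offset, which is far cheaper to encode. A sketch of the bookkeeping
// over three offsets (ignoring the litLen==0 special cases); names are
// illustrative:

type recentOffsets struct{ o [3]int32 }

// use returns the idx-th most recent offset (0-based) and promotes it,
// mirroring the offset1/offset2/offset3 rotations in these encoders.
func (r *recentOffsets) use(idx int) int32 {
	off := r.o[idx]
	switch idx {
	case 1:
		r.o[0], r.o[1] = r.o[1], r.o[0]
	case 2:
		r.o[0], r.o[1], r.o[2] = r.o[2], r.o[0], r.o[1]
	}
	return off
}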
+ startLimit := nextEmit + 1 + + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 { + repIndex-- + start-- + seq.matchLen++ + } + addLiterals(&seq, start) + + // rep 2 + seq.offset = 2 + if debugSequences { + println("repeat sequence 2", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + s += length + repOff2 + nextEmit = s + if s >= sLimit { + if debugEncoder { + println("repeat ended", s, length) + + } + break encodeLoop + } + + // Index skipped... + for index0 < s-1 { + cv0 := load6432(src, index0) + cv1 := cv0 >> 8 + h0 := hashLen(cv0, betterLongTableBits, betterLongLen) + off := index0 + e.cur + e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} + e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)} + index0 += 2 + } + cv = load6432(src, s) + // Swap offsets + offset1, offset2 = offset2, offset1 + continue + } + } + // Find the offsets of our two matches. + coffsetL := candidateL.offset - e.cur + coffsetLP := candidateL.prev - e.cur + + // Check if we have a long match. + if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) { + // Found a long match, at least 8 bytes. + matched = e.matchlen(s+8, coffsetL+8, src) + 8 + t = coffsetL + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugMatches { + println("long match") + } + + if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) { + // Found a long match, at least 8 bytes. + prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8 + if prevMatch > matched { + matched = prevMatch + t = coffsetLP + } + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugMatches { + println("long match") + } + } + break + } + + // Check if we have a long match on prev. + if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) { + // Found a long match, at least 8 bytes. + matched = e.matchlen(s+8, coffsetLP+8, src) + 8 + t = coffsetLP + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugMatches { + println("long match") + } + break + } + + coffsetS := candidateS.offset - e.cur + + // Check if we have a short match. + if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val { + // found a regular match + matched = e.matchlen(s+4, coffsetS+4, src) + 4 + + // See if we can find a long match at s+1 + const checkAt = 1 + cv := load6432(src, s+checkAt) + nextHashL = hashLen(cv, betterLongTableBits, betterLongLen) + candidateL = e.longTable[nextHashL] + coffsetL = candidateL.offset - e.cur + + // We can store it, since we have at least a 4 byte match. + e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset} + if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) { + // Found a long match, at least 8 bytes. + matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8 + if matchedNext > matched { + t = coffsetL + s += checkAt + matched = matchedNext + if debugMatches { + println("long match (after short)") + } + break + } + } + + // Check prev long... 
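// Each candidate fetched from a table is validated cheaply before the full
// match length is computed: it must lie inside the window, and its first
// bytes must equal the current position (the real code compares a 4- or
// 8-byte unaligned load; plain byte compares are used here to stay
// self-contained). The caller guarantees a few readable bytes of margin:

func candidateOK(src []byte, s, cand, maxMatchOff int32) bool {
	delta := s - cand
	if delta <= 0 || delta >= maxMatchOff {
		return false // out of window, or not strictly in the past
	}
	// Quick 4-byte filter; hash collisions almost always fail here.
	return src[s] == src[cand] && src[s+1] == src[cand+1] &&
		src[s+2] == src[cand+2] && src[s+3] == src[cand+3]
}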
+ coffsetL = candidateL.prev - e.cur + if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) { + // Found a long match, at least 8 bytes. + matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8 + if matchedNext > matched { + t = coffsetL + s += checkAt + matched = matchedNext + if debugMatches { + println("prev long match (after short)") + } + break + } + } + t = coffsetS + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugAsserts && t < 0 { + panic("t<0") + } + if debugMatches { + println("short match") + } + break + } + + // No match found, move forward in input. + s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1)) + if s >= sLimit { + break encodeLoop + } + cv = load6432(src, s) + } + + // Try to find a better match by searching for a long match at the end of the current best match + if s+matched < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is around 3 bytes, but depends on input. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 3 + + nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen) + s2 := s + skipBeginning + cv := load3232(src, s2) + candidateL := e.longTable[nextHashL] + coffsetL := candidateL.offset - e.cur - matched + skipBeginning + if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + // Found a long match, at least 4 bytes. + matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4 + if matchedNext > matched { + t = coffsetL + s = s2 + matched = matchedNext + if debugMatches { + println("long match at end-of-match") + } + } + } + + // Check prev long... + if true { + coffsetL = candidateL.prev - e.cur - matched + skipBeginning + if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + // Found a long match, at least 4 bytes. + matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4 + if matchedNext > matched { + t = coffsetL + s = s2 + matched = matchedNext + if debugMatches { + println("prev long match at end-of-match") + } + } + } + } + } + // A match has been found. Update recent offsets. + offset2 = offset1 + offset1 = s - t + + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + + if debugAsserts && canRepeat && int(offset1) > len(src) { + panic("invalid offset") + } + + // Extend the n-byte match as long as possible. + l := matched + + // Extend backwards + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength { + s-- + t-- + l++ + } + + // Write our sequence + var seq seq + seq.litLen = uint32(s - nextEmit) + seq.matchLen = uint32(l - zstdMinMatch) + if seq.litLen > 0 { + blk.literals = append(blk.literals, src[nextEmit:s]...) 
+ } + seq.offset = uint32(s-t) + 3 + s += l + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + nextEmit = s + if s >= sLimit { + break encodeLoop + } + + // Index match start+1 (long) -> s - 1 + off := index0 + e.cur + for index0 < s-1 { + cv0 := load6432(src, index0) + cv1 := cv0 >> 8 + h0 := hashLen(cv0, betterLongTableBits, betterLongLen) + e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} + e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)} + index0 += 2 + off += 2 + } + + cv = load6432(src, s) + if !canRepeat { + continue + } + + // Check offset 2 + for { + o2 := s - offset2 + if load3232(src, o2) != uint32(cv) { + // Do regular search + break + } + + // Store this, since we have it. + nextHashL := hashLen(cv, betterLongTableBits, betterLongLen) + nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) + + // We have at least 4 byte match. + // No need to check backwards. We come straight from a match + l := 4 + e.matchlen(s+4, o2+4, src) + + e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset} + e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)} + seq.matchLen = uint32(l) - zstdMinMatch + seq.litLen = 0 + + // Since litlen is always 0, this is offset 1. + seq.offset = 1 + s += l + nextEmit = s + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + // Swap offset 1 and 2. + offset1, offset2 = offset2, offset1 + if s >= sLimit { + // Finished + break encodeLoop + } + cv = load6432(src, s) + } + } + + if int(nextEmit) < len(src) { + blk.literals = append(blk.literals, src[nextEmit:]...) + blk.extraLits = len(src) - int(nextEmit) + } + blk.recentOffsets[0] = uint32(offset1) + blk.recentOffsets[1] = uint32(offset2) + if debugEncoder { + println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) + } } -// betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches. -// The long match table contains the previous entry with the same hash, -// effectively making it a "chain" of length 2. -// When we find a long match we choose between the two values and select the longest. -// When we find a short match, after checking the long, we check if we can find a long at n+1 -// and that it is longer (lazy matching). -type betterFastEncoder struct { - fastBase - table [betterShortTableSize]tableEntry - longTable [betterLongTableSize]prevEntry - dictTable []tableEntry - dictLongTable []prevEntry +// EncodeNoHist will encode a block with no history and no following blocks. +// Most notable difference is that src will not be copied for history and +// we do not need to check for max match length. +func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { + e.ensureHist(len(src)) + e.Encode(blk, src) } // Encode improves compression... -func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) { +func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) { const ( // Input margin is the number of bytes we read (8) // and the maximum we will read ahead (2) @@ -47,7 +594,7 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. 
- for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { for i := range e.table[:] { e.table[i] = tableEntry{} @@ -56,6 +603,7 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) { e.longTable[i] = prevEntry{} } e.cur = e.maxMatchOff + e.allDirty = true break } // Shift down everything in the table that isn't already too far away. @@ -88,6 +636,7 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) { prev: v2, } } + e.allDirty = true e.cur = e.maxMatchOff break } @@ -125,7 +674,7 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) { blk.literals = append(blk.literals, src[nextEmit:until]...) s.litLen = uint32(until - nextEmit) } - if debug { + if debugEncoder { println("recent offsets:", blk.recentOffsets) } @@ -134,15 +683,15 @@ encodeLoop: var t int32 // We allow the encoder to optionally turn off repeat offsets across blocks canRepeat := len(blk.sequences) > 2 - var matched int32 + var matched, index0 int32 for { if debugAsserts && canRepeat && offset1 == 0 { panic("offset0 was 0") } - nextHashS := hash5(cv, betterShortTableBits) - nextHashL := hash8(cv, betterLongTableBits) + nextHashL := hashLen(cv, betterLongTableBits, betterLongLen) + nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) candidateL := e.longTable[nextHashL] candidateS := e.table[nextHashS] @@ -150,15 +699,18 @@ encodeLoop: repIndex := s - offset1 + repOff off := s + e.cur e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset} + e.markLongShardDirty(nextHashL) e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)} + e.markShortShardDirty(nextHashS) + index0 = s + 1 if canRepeat { if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) { // Consider history as well. var seq seq - lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src) + length := 4 + e.matchlen(s+4+repOff, repIndex+4, src) - seq.matchLen = uint32(lenght - zstdMinMatch) + seq.matchLen = uint32(length - zstdMinMatch) // We might be able to match backwards. // Extend as long as we can. @@ -186,13 +738,12 @@ encodeLoop: blk.sequences = append(blk.sequences, seq) // Index match start+1 (long) -> s - 1 - index0 := s + repOff - s += lenght + repOff + s += length + repOff nextEmit = s if s >= sLimit { - if debug { - println("repeat ended", s, lenght) + if debugEncoder { + println("repeat ended", s, length) } break encodeLoop @@ -201,10 +752,13 @@ encodeLoop: for index0 < s-1 { cv0 := load6432(src, index0) cv1 := cv0 >> 8 - h0 := hash8(cv0, betterLongTableBits) + h0 := hashLen(cv0, betterLongTableBits, betterLongLen) off := index0 + e.cur e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} - e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)} + e.markLongShardDirty(h0) + h1 := hashLen(cv1, betterShortTableBits, betterShortLen) + e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)} + e.markShortShardDirty(h1) index0 += 2 } cv = load6432(src, s) @@ -218,9 +772,9 @@ encodeLoop: if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) { // Consider history as well. var seq seq - lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src) + length := 8 + e.matchlen(s+8+repOff2, repIndex+8, src) - seq.matchLen = uint32(lenght - zstdMinMatch) + seq.matchLen = uint32(length - zstdMinMatch) // We might be able to match backwards. // Extend as long as we can. 
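// On the hash5/hash8 -> hashLen change running through these hunks: the new
// helper takes the number of hashed input bytes explicitly, so one function
// covers every table. The shape is multiplicative hashing: keep the low mls
// bytes of a 64-bit load, multiply by a large odd constant, take the top
// `bits` bits. The multiplier below is illustrative; the package defines a
// separate prime per input length.

// hashSketch assumes mls in [1,8] and bits in [1,32].
func hashSketch(u uint64, bits, mls uint8) uint32 {
	u <<= 64 - 8*mls                  // keep only the low mls bytes, at the top
	const prime8 = 0xcf1bbcdcb7a56463 // a 64-bit odd multiplier
	return uint32((u * prime8) >> (64 - bits))
}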
@@ -247,12 +801,11 @@ encodeLoop: } blk.sequences = append(blk.sequences, seq) - index0 := s + repOff2 - s += lenght + repOff2 + s += length + repOff2 nextEmit = s if s >= sLimit { - if debug { - println("repeat ended", s, lenght) + if debugEncoder { + println("repeat ended", s, length) } break encodeLoop @@ -262,10 +815,13 @@ encodeLoop: for index0 < s-1 { cv0 := load6432(src, index0) cv1 := cv0 >> 8 - h0 := hash8(cv0, betterLongTableBits) + h0 := hashLen(cv0, betterLongTableBits, betterLongLen) off := index0 + e.cur e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} - e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)} + e.markLongShardDirty(h0) + h1 := hashLen(cv1, betterShortTableBits, betterShortLen) + e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)} + e.markShortShardDirty(h1) index0 += 2 } cv = load6432(src, s) @@ -340,12 +896,13 @@ encodeLoop: // See if we can find a long match at s+1 const checkAt = 1 cv := load6432(src, s+checkAt) - nextHashL = hash8(cv, betterLongTableBits) + nextHashL = hashLen(cv, betterLongTableBits, betterLongLen) candidateL = e.longTable[nextHashL] coffsetL = candidateL.offset - e.cur // We can store it, since we have at least a 4 byte match. e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset} + e.markLongShardDirty(nextHashL) if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) { // Found a long match, at least 8 bytes. matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8 @@ -398,9 +955,41 @@ encodeLoop: } cv = load6432(src, s) } + // Try to find a better match by searching for a long match at the end of the current best match + if s+matched < sLimit { + nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen) + cv := load3232(src, s) + candidateL := e.longTable[nextHashL] + coffsetL := candidateL.offset - e.cur - matched + if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + // Found a long match, at least 4 bytes. + matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4 + if matchedNext > matched { + t = coffsetL + matched = matchedNext + if debugMatches { + println("long match at end-of-match") + } + } + } - // A 4-byte match has been found. Update recent offsets. - // We'll later see if more than 4 bytes. + // Check prev long... + if true { + coffsetL = candidateL.prev - e.cur - matched + if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + // Found a long match, at least 4 bytes. + matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4 + if matchedNext > matched { + t = coffsetL + matched = matchedNext + if debugMatches { + println("prev long match at end-of-match") + } + } + } + } + } + // A match has been found. Update recent offsets. 
offset2 = offset1 offset1 = s - t @@ -445,15 +1034,18 @@ encodeLoop: } // Index match start+1 (long) -> s - 1 - index0 := s - l + 1 + off := index0 + e.cur for index0 < s-1 { cv0 := load6432(src, index0) cv1 := cv0 >> 8 - h0 := hash8(cv0, betterLongTableBits) - off := index0 + e.cur + h0 := hashLen(cv0, betterLongTableBits, betterLongLen) e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset} - e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)} + e.markLongShardDirty(h0) + h1 := hashLen(cv1, betterShortTableBits, betterShortLen) + e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)} + e.markShortShardDirty(h1) index0 += 2 + off += 2 } cv = load6432(src, s) @@ -470,15 +1062,17 @@ encodeLoop: } // Store this, since we have it. - nextHashS := hash5(cv, betterShortTableBits) - nextHashL := hash8(cv, betterLongTableBits) + nextHashL := hashLen(cv, betterLongTableBits, betterLongLen) + nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) // We have at least 4 byte match. // No need to check backwards. We come straight from a match l := 4 + e.matchlen(s+4, o2+4, src) e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset} + e.markLongShardDirty(nextHashL) e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)} + e.markShortShardDirty(nextHashS) seq.matchLen = uint32(l) - zstdMinMatch seq.litLen = 0 @@ -507,20 +1101,21 @@ encodeLoop: } blk.recentOffsets[0] = uint32(offset1) blk.recentOffsets[1] = uint32(offset2) - if debug { + if debugEncoder { println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) } } -// EncodeNoHist will encode a block with no history and no following blocks. -// Most notable difference is that src will not be copied for history and -// we do not need to check for max match length. 
-func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { - e.Encode(blk, src) +// ResetDict will reset and set a dictionary if not nil +func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) { + e.resetBase(d, singleBlock) + if d != nil { + panic("betterFastEncoder: Reset with dict") + } } // ResetDict will reset and set a dictionary if not nil -func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) { +func (e *betterFastEncoderDict) Reset(d *dict, singleBlock bool) { e.resetBase(d, singleBlock) if d == nil { return @@ -535,10 +1130,10 @@ func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) { const hashLog = betterShortTableBits cv := load6432(d.content, i-e.maxMatchOff) - nextHash := hash5(cv, hashLog) // 0 -> 4 - nextHash1 := hash5(cv>>8, hashLog) // 1 -> 5 - nextHash2 := hash5(cv>>16, hashLog) // 2 -> 6 - nextHash3 := hash5(cv>>24, hashLog) // 3 -> 7 + nextHash := hashLen(cv, hashLog, betterShortLen) // 0 -> 4 + nextHash1 := hashLen(cv>>8, hashLog, betterShortLen) // 1 -> 5 + nextHash2 := hashLen(cv>>16, hashLog, betterShortLen) // 2 -> 6 + nextHash3 := hashLen(cv>>24, hashLog, betterShortLen) // 3 -> 7 e.dictTable[nextHash] = tableEntry{ val: uint32(cv), offset: i, @@ -557,6 +1152,7 @@ func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) { } } e.lastDictID = d.id + e.allDirty = true } // Init or copy dict table @@ -566,7 +1162,7 @@ func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) { } if len(d.content) >= 8 { cv := load6432(d.content, 0) - h := hash8(cv, betterLongTableBits) + h := hashLen(cv, betterLongTableBits, betterLongLen) e.dictLongTable[h] = prevEntry{ offset: e.maxMatchOff, prev: e.dictLongTable[h].offset, @@ -576,7 +1172,7 @@ func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) { off := 8 // First to read for i := e.maxMatchOff + 1; i < end; i++ { cv = cv>>8 | (uint64(d.content[off]) << 56) - h := hash8(cv, betterLongTableBits) + h := hashLen(cv, betterLongTableBits, betterLongLen) e.dictLongTable[h] = prevEntry{ offset: i, prev: e.dictLongTable[h].offset, @@ -585,11 +1181,72 @@ func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) { } } e.lastDictID = d.id + e.allDirty = true } + // Reset table to initial state - copy(e.longTable[:], e.dictLongTable) + { + dirtyShardCnt := 0 + if !e.allDirty { + for i := range e.shortTableShardDirty { + if e.shortTableShardDirty[i] { + dirtyShardCnt++ + } + } + } + const shardCnt = betterShortTableShardCnt + const shardSize = betterShortTableShardSize + if e.allDirty || dirtyShardCnt > shardCnt*4/6 { + copy(e.table[:], e.dictTable) + for i := range e.shortTableShardDirty { + e.shortTableShardDirty[i] = false + } + } else { + for i := range e.shortTableShardDirty { + if !e.shortTableShardDirty[i] { + continue + } + + copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize]) + e.shortTableShardDirty[i] = false + } + } + } + { + dirtyShardCnt := 0 + if !e.allDirty { + for i := range e.shortTableShardDirty { + if e.shortTableShardDirty[i] { + dirtyShardCnt++ + } + } + } + const shardCnt = betterLongTableShardCnt + const shardSize = betterLongTableShardSize + if e.allDirty || dirtyShardCnt > shardCnt*4/6 { + copy(e.longTable[:], e.dictLongTable) + for i := range e.longTableShardDirty { + e.longTableShardDirty[i] = false + } + } else { + for i := range e.longTableShardDirty { + if !e.longTableShardDirty[i] { + continue + } + copy(e.longTable[i*shardSize:(i+1)*shardSize], e.dictLongTable[i*shardSize:(i+1)*shardSize]) + e.longTableShardDirty[i] = false + } 
+ } + } e.cur = e.maxMatchOff - // Reset table to initial state - copy(e.table[:], e.dictTable) + e.allDirty = false +} + +func (e *betterFastEncoderDict) markLongShardDirty(entryNum uint32) { + e.longTableShardDirty[entryNum/betterLongTableShardSize] = true +} + +func (e *betterFastEncoderDict) markShortShardDirty(entryNum uint32) { + e.shortTableShardDirty[entryNum/betterShortTableShardSize] = true } diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go index 19eebf66e5..d36be7bd8c 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go @@ -10,16 +10,28 @@ const ( dFastLongTableBits = 17 // Bits used in the long match table dFastLongTableSize = 1 << dFastLongTableBits // Size of the table dFastLongTableMask = dFastLongTableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. + dFastLongLen = 8 // Bytes used for table hash + + dLongTableShardCnt = 1 << (dFastLongTableBits - dictShardBits) // Number of shards in the table + dLongTableShardSize = dFastLongTableSize / tableShardCnt // Size of an individual shard dFastShortTableBits = tableBits // Bits used in the short match table dFastShortTableSize = 1 << dFastShortTableBits // Size of the table dFastShortTableMask = dFastShortTableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. + dFastShortLen = 5 // Bytes used for table hash + ) type doubleFastEncoder struct { fastEncoder - longTable [dFastLongTableSize]tableEntry - dictLongTable []tableEntry + longTable [dFastLongTableSize]tableEntry +} + +type doubleFastEncoderDict struct { + fastEncoderDict + longTable [dFastLongTableSize]tableEntry + dictLongTable []tableEntry + longTableShardDirty [dLongTableShardCnt]bool } // Encode mimmics functionality in zstd_dfast.c @@ -32,14 +44,10 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { - for i := range e.table[:] { - e.table[i] = tableEntry{} - } - for i := range e.longTable[:] { - e.longTable[i] = tableEntry{} - } + e.table = [dFastShortTableSize]tableEntry{} + e.longTable = [dFastLongTableSize]tableEntry{} e.cur = e.maxMatchOff break } @@ -100,7 +108,7 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) { blk.literals = append(blk.literals, src[nextEmit:until]...) s.litLen = uint32(until - nextEmit) } - if debug { + if debugEncoder { println("recent offsets:", blk.recentOffsets) } @@ -115,8 +123,8 @@ encodeLoop: panic("offset0 was 0") } - nextHashS := hash5(cv, dFastShortTableBits) - nextHashL := hash8(cv, dFastLongTableBits) + nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) + nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) candidateL := e.longTable[nextHashL] candidateS := e.table[nextHashS] @@ -130,9 +138,9 @@ encodeLoop: if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) { // Consider history as well. var seq seq - lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src) + length := 4 + e.matchlen(s+4+repOff, repIndex+4, src) - seq.matchLen = uint32(lenght - zstdMinMatch) + seq.matchLen = uint32(length - zstdMinMatch) // We might be able to match backwards. // Extend as long as we can. 
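// The wraparound resets in these hunks switch from per-element clearing
// loops to assigning a zero composite literal (e.table =
// [dFastShortTableSize]tableEntry{}), which the compiler can typically lower
// to a single bulk memory clear. The idiom in isolation:

type entrySketch struct{ offset, val int32 }

// clearByAssign zeroes the whole array in one statement instead of a loop.
func clearByAssign(t *[1 << 10]entrySketch) {
	*t = [1 << 10]entrySketch{}
}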
@@ -158,11 +166,11 @@ encodeLoop: println("repeat sequence", seq, "next s:", s) } blk.sequences = append(blk.sequences, seq) - s += lenght + repOff + s += length + repOff nextEmit = s if s >= sLimit { - if debug { - println("repeat ended", s, lenght) + if debugEncoder { + println("repeat ended", s, length) } break encodeLoop @@ -199,7 +207,7 @@ encodeLoop: // See if we can find a long match at s+1 const checkAt = 1 cv := load6432(src, s+checkAt) - nextHashL = hash8(cv, dFastLongTableBits) + nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen) candidateL = e.longTable[nextHashL] coffsetL = s - (candidateL.offset - e.cur) + checkAt @@ -295,16 +303,16 @@ encodeLoop: cv1 := load6432(src, index1) te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)} te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)} - e.longTable[hash8(cv0, dFastLongTableBits)] = te0 - e.longTable[hash8(cv1, dFastLongTableBits)] = te1 + e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0 + e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1 cv0 >>= 8 cv1 >>= 8 te0.offset++ te1.offset++ te0.val = uint32(cv0) te1.val = uint32(cv1) - e.table[hash5(cv0, dFastShortTableBits)] = te0 - e.table[hash5(cv1, dFastShortTableBits)] = te1 + e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0 + e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1 cv = load6432(src, s) @@ -321,8 +329,8 @@ encodeLoop: } // Store this, since we have it. - nextHashS := hash5(cv, dFastShortTableBits) - nextHashL := hash8(cv, dFastLongTableBits) + nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) + nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) // We have at least 4 byte match. // No need to check backwards. We come straight from a match @@ -359,7 +367,7 @@ encodeLoop: } blk.recentOffsets[0] = uint32(offset1) blk.recentOffsets[1] = uint32(offset2) - if debug { + if debugEncoder { println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) } } @@ -376,7 +384,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - if e.cur >= bufferReset { + if e.cur >= e.bufferReset { for i := range e.table[:] { e.table[i] = tableEntry{} } @@ -418,7 +426,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { blk.literals = append(blk.literals, src[nextEmit:until]...) 
s.litLen = uint32(until - nextEmit) } - if debug { + if debugEncoder { println("recent offsets:", blk.recentOffsets) } @@ -427,8 +435,8 @@ encodeLoop: var t int32 for { - nextHashS := hash5(cv, dFastShortTableBits) - nextHashL := hash8(cv, dFastLongTableBits) + nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) + nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) candidateL := e.longTable[nextHashL] candidateS := e.table[nextHashS] @@ -474,7 +482,7 @@ encodeLoop: s += length + repOff nextEmit = s if s >= sLimit { - if debug { + if debugEncoder { println("repeat ended", s, length) } @@ -512,7 +520,7 @@ encodeLoop: // See if we can find a long match at s+1 const checkAt = 1 cv := load6432(src, s+checkAt) - nextHashL = hash8(cv, dFastLongTableBits) + nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen) candidateL = e.longTable[nextHashL] coffsetL = s - (candidateL.offset - e.cur) + checkAt @@ -605,16 +613,16 @@ encodeLoop: cv1 := load6432(src, index1) te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)} te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)} - e.longTable[hash8(cv0, dFastLongTableBits)] = te0 - e.longTable[hash8(cv1, dFastLongTableBits)] = te1 + e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0 + e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1 cv0 >>= 8 cv1 >>= 8 te0.offset++ te1.offset++ te0.val = uint32(cv0) te1.val = uint32(cv1) - e.table[hash5(cv0, dFastShortTableBits)] = te0 - e.table[hash5(cv1, dFastShortTableBits)] = te1 + e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0 + e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1 cv = load6432(src, s) @@ -631,8 +639,8 @@ encodeLoop: } // Store this, since we have it. - nextHashS := hash5(cv1>>8, dFastShortTableBits) - nextHashL := hash8(cv, dFastLongTableBits) + nextHashS := hashLen(cv1>>8, dFastShortTableBits, dFastShortLen) + nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) // We have at least 4 byte match. // No need to check backwards. We come straight from a match @@ -668,19 +676,389 @@ encodeLoop: blk.literals = append(blk.literals, src[nextEmit:]...) blk.extraLits = len(src) - int(nextEmit) } - if debug { + if debugEncoder { println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) } // We do not store history, so we must offset e.cur to avoid false matches for next user. - if e.cur < bufferReset { + if e.cur < e.bufferReset { e.cur += int32(len(src)) } } +// Encode will encode the content, with a dictionary if initialized for it. +func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) { + const ( + // Input margin is the number of bytes we read (8) + // and the maximum we will read ahead (2) + inputMargin = 8 + 2 + minNonLiteralBlockSize = 16 + ) + + // Protect against e.cur wraparound. + for e.cur >= e.bufferReset-int32(len(e.hist)) { + if len(e.hist) == 0 { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.longTable[:] { + e.longTable[i] = tableEntry{} + } + e.markAllShardsDirty() + e.cur = e.maxMatchOff + break + } + // Shift down everything in the table that isn't already too far away. 
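// The shift-down below is the slow half of the wraparound protection: rather
// than discarding all history, every stored offset is rebased so the window
// starts at maxMatchOff again, and entries that have already fallen out of
// the window collapse to zero. Stand-alone, over a plain offset slice:

func shiftOffsets(table []int32, cur, histLen, maxMatchOff int32) {
	minOff := cur + histLen - maxMatchOff
	for i, v := range table {
		if v < minOff {
			table[i] = 0 // too old to ever match again
		} else {
			table[i] = v - cur + maxMatchOff
		}
	}
}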
+ minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff + for i := range e.table[:] { + v := e.table[i].offset + if v < minOff { + v = 0 + } else { + v = v - e.cur + e.maxMatchOff + } + e.table[i].offset = v + } + for i := range e.longTable[:] { + v := e.longTable[i].offset + if v < minOff { + v = 0 + } else { + v = v - e.cur + e.maxMatchOff + } + e.longTable[i].offset = v + } + e.markAllShardsDirty() + e.cur = e.maxMatchOff + break + } + + s := e.addBlock(src) + blk.size = len(src) + if len(src) < minNonLiteralBlockSize { + blk.extraLits = len(src) + blk.literals = blk.literals[:len(src)] + copy(blk.literals, src) + return + } + + // Override src + src = e.hist + sLimit := int32(len(src)) - inputMargin + // stepSize is the number of bytes to skip on every main loop iteration. + // It should be >= 1. + const stepSize = 1 + + const kSearchStrength = 8 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := s + cv := load6432(src, s) + + // Relative offsets + offset1 := int32(blk.recentOffsets[0]) + offset2 := int32(blk.recentOffsets[1]) + + addLiterals := func(s *seq, until int32) { + if until == nextEmit { + return + } + blk.literals = append(blk.literals, src[nextEmit:until]...) + s.litLen = uint32(until - nextEmit) + } + if debugEncoder { + println("recent offsets:", blk.recentOffsets) + } + +encodeLoop: + for { + var t int32 + // We allow the encoder to optionally turn off repeat offsets across blocks + canRepeat := len(blk.sequences) > 2 + + for { + if debugAsserts && canRepeat && offset1 == 0 { + panic("offset0 was 0") + } + + nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) + nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) + candidateL := e.longTable[nextHashL] + candidateS := e.table[nextHashS] + + const repOff = 1 + repIndex := s - offset1 + repOff + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.longTable[nextHashL] = entry + e.markLongShardDirty(nextHashL) + e.table[nextHashS] = entry + e.markShardDirty(nextHashS) + + if canRepeat { + if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) { + // Consider history as well. + var seq seq + length := 4 + e.matchlen(s+4+repOff, repIndex+4, src) + + seq.matchLen = uint32(length - zstdMinMatch) + + // We might be able to match backwards. + // Extend as long as we can. + start := s + repOff + // We end the search early, so we don't risk 0 literals + // and have to do special offset treatment. + startLimit := nextEmit + 1 + + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 { + repIndex-- + start-- + seq.matchLen++ + } + addLiterals(&seq, start) + + // rep 0 + seq.offset = 1 + if debugSequences { + println("repeat sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + s += length + repOff + nextEmit = s + if s >= sLimit { + if debugEncoder { + println("repeat ended", s, length) + + } + break encodeLoop + } + cv = load6432(src, s) + continue + } + } + // Find the offsets of our two matches. + coffsetL := s - (candidateL.offset - e.cur) + coffsetS := s - (candidateS.offset - e.cur) + + // Check if we have a long match. + if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val { + // Found a long match, likely at least 8 bytes. + // Reference encoder checks all 8 bytes, we only check 4, + // but the likelihood of both the first 4 bytes and the hash matching should be enough. 
+ t = candidateL.offset - e.cur + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugMatches { + println("long match") + } + break + } + + // Check if we have a short match. + if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val { + // found a regular match + // See if we can find a long match at s+1 + const checkAt = 1 + cv := load6432(src, s+checkAt) + nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen) + candidateL = e.longTable[nextHashL] + coffsetL = s - (candidateL.offset - e.cur) + checkAt + + // We can store it, since we have at least a 4 byte match. + e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)} + e.markLongShardDirty(nextHashL) + if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val { + // Found a long match, likely at least 8 bytes. + // Reference encoder checks all 8 bytes, we only check 4, + // but the likelihood of both the first 4 bytes and the hash matching should be enough. + t = candidateL.offset - e.cur + s += checkAt + if debugMatches { + println("long match (after short)") + } + break + } + + t = candidateS.offset - e.cur + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugAsserts && t < 0 { + panic("t<0") + } + if debugMatches { + println("short match") + } + break + } + + // No match found, move forward in input. + s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1)) + if s >= sLimit { + break encodeLoop + } + cv = load6432(src, s) + } + + // A 4-byte match has been found. Update recent offsets. + // We'll later see if more than 4 bytes. + offset2 = offset1 + offset1 = s - t + + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + + if debugAsserts && canRepeat && int(offset1) > len(src) { + panic("invalid offset") + } + + // Extend the 4-byte match as long as possible. + l := e.matchlen(s+4, t+4, src) + 4 + + // Extend backwards + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength { + s-- + t-- + l++ + } + + // Write our sequence + var seq seq + seq.litLen = uint32(s - nextEmit) + seq.matchLen = uint32(l - zstdMinMatch) + if seq.litLen > 0 { + blk.literals = append(blk.literals, src[nextEmit:s]...) 
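
Throughout these hunks the fixed hash5/hash8 helpers are replaced by a single parameterized hashLen; the old call sites show that dFastShortLen and dFastLongLen correspond to the former 5- and 8-byte hashes. A hedged sketch of the underlying multiplicative-hash technique, using one illustrative prime rather than the per-length constants the package actually defines:

package main

import "fmt"

// hashLen hashes the lowest mls bytes of the 64-bit load u into a
// hashLog-bit table index: shift left to discard bytes beyond the match
// length, multiply by a large odd constant, keep the top bits. The single
// prime below is illustrative; the package picks a constant per length.
func hashLen(u uint64, hashLog, mls uint8) uint32 {
	const prime = 0xcf1bbcdcb7a56463
	u <<= 64 - 8*mls
	return uint32((u * prime) >> (64 - hashLog))
}

func main() {
	cv := uint64(0x0807060504030201)
	fmt.Println(hashLen(cv, 17, 8)) // long-table style index over 8 bytes
	fmt.Println(hashLen(cv, 16, 5)) // short-table style index over 5 bytes
}
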
+ } + seq.offset = uint32(s-t) + 3 + s += l + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + nextEmit = s + if s >= sLimit { + break encodeLoop + } + + // Index match start+1 (long) and start+2 (short) + index0 := s - l + 1 + // Index match end-2 (long) and end-1 (short) + index1 := s - 2 + + cv0 := load6432(src, index0) + cv1 := load6432(src, index1) + te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)} + te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)} + longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen) + longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen) + e.longTable[longHash1] = te0 + e.longTable[longHash2] = te1 + e.markLongShardDirty(longHash1) + e.markLongShardDirty(longHash2) + cv0 >>= 8 + cv1 >>= 8 + te0.offset++ + te1.offset++ + te0.val = uint32(cv0) + te1.val = uint32(cv1) + hashVal1 := hashLen(cv0, dFastShortTableBits, dFastShortLen) + hashVal2 := hashLen(cv1, dFastShortTableBits, dFastShortLen) + e.table[hashVal1] = te0 + e.markShardDirty(hashVal1) + e.table[hashVal2] = te1 + e.markShardDirty(hashVal2) + + cv = load6432(src, s) + + if !canRepeat { + continue + } + + // Check offset 2 + for { + o2 := s - offset2 + if load3232(src, o2) != uint32(cv) { + // Do regular search + break + } + + // Store this, since we have it. + nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) + nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) + + // We have at least 4 byte match. + // No need to check backwards. We come straight from a match + l := 4 + e.matchlen(s+4, o2+4, src) + + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.longTable[nextHashL] = entry + e.markLongShardDirty(nextHashL) + e.table[nextHashS] = entry + e.markShardDirty(nextHashS) + seq.matchLen = uint32(l) - zstdMinMatch + seq.litLen = 0 + + // Since litlen is always 0, this is offset 1. + seq.offset = 1 + s += l + nextEmit = s + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + // Swap offset 1 and 2. + offset1, offset2 = offset2, offset1 + if s >= sLimit { + // Finished + break encodeLoop + } + cv = load6432(src, s) + } + } + + if int(nextEmit) < len(src) { + blk.literals = append(blk.literals, src[nextEmit:]...) + blk.extraLits = len(src) - int(nextEmit) + } + blk.recentOffsets[0] = uint32(offset1) + blk.recentOffsets[1] = uint32(offset2) + if debugEncoder { + println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) + } + // If we encoded more than 64K mark all dirty. 
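
The markShardDirty/markLongShardDirty calls sprinkled through the dictionary encoder record which slices ("shards") of the hash tables a block touched, so that Reset can copy back only the dirty shards from the pristine dictionary tables instead of the whole array. A simplified sketch of that bookkeeping, using copy where the diff uses Go's slice-to-array-pointer conversion for the same move (shard count and sizes below are illustrative):

package main

import "fmt"

const (
	tableSize = 1 << 15
	shardCnt  = 1 << 5 // illustrative; the real count derives from dictShardBits
	shardSize = tableSize / shardCnt
)

type entry struct{ offset, val uint32 }

type shardedTable struct {
	table [tableSize]entry
	dirty [shardCnt]bool
	all   bool
}

// markDirty records the shard an updated slot belongs to, mirroring
// markShardDirty in the diff.
func (t *shardedTable) markDirty(idx uint32) {
	t.dirty[idx/shardSize] = true
}

// restore copies pristine state back, but only into shards that were touched.
func (t *shardedTable) restore(clean []entry) {
	if t.all {
		copy(t.table[:], clean)
		t.all = false
		for i := range t.dirty {
			t.dirty[i] = false
		}
		return
	}
	for i := range t.dirty {
		if !t.dirty[i] {
			continue
		}
		copy(t.table[i*shardSize:(i+1)*shardSize], clean[i*shardSize:(i+1)*shardSize])
		t.dirty[i] = false
	}
}

func main() {
	var t shardedTable
	clean := make([]entry, tableSize)
	t.table[3] = entry{offset: 9, val: 9}
	t.markDirty(3)
	t.restore(clean)
	fmt.Println(t.table[3]) // {0 0}: only the touched shard was re-copied
}
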
+ if len(src) > 64<<10 { + e.markAllShardsDirty() + } +} + // ResetDict will reset and set a dictionary if not nil func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) { e.fastEncoder.Reset(d, singleBlock) + if d != nil { + panic("doubleFastEncoder: Reset with dict not supported") + } +} + +// ResetDict will reset and set a dictionary if not nil +func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) { + allDirty := e.allDirty + e.fastEncoderDict.Reset(d, singleBlock) if d == nil { return } @@ -692,22 +1070,54 @@ func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) { } if len(d.content) >= 8 { cv := load6432(d.content, 0) - e.dictLongTable[hash8(cv, dFastLongTableBits)] = tableEntry{ + e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{ val: uint32(cv), offset: e.maxMatchOff, } end := int32(len(d.content)) - 8 + e.maxMatchOff for i := e.maxMatchOff + 1; i < end; i++ { cv = cv>>8 | (uint64(d.content[i-e.maxMatchOff+7]) << 56) - e.dictLongTable[hash8(cv, dFastLongTableBits)] = tableEntry{ + e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{ val: uint32(cv), offset: i, } } } e.lastDictID = d.id + allDirty = true } // Reset table to initial state e.cur = e.maxMatchOff - copy(e.longTable[:], e.dictLongTable) + + dirtyShardCnt := 0 + if !allDirty { + for i := range e.longTableShardDirty { + if e.longTableShardDirty[i] { + dirtyShardCnt++ + } + } + } + + if allDirty || dirtyShardCnt > dLongTableShardCnt/2 { + //copy(e.longTable[:], e.dictLongTable) + e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable) + for i := range e.longTableShardDirty { + e.longTableShardDirty[i] = false + } + return + } + for i := range e.longTableShardDirty { + if !e.longTableShardDirty[i] { + continue + } + + // copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize]) + *(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:]) + + e.longTableShardDirty[i] = false + } +} + +func (e *doubleFastEncoderDict) markLongShardDirty(entryNum uint32) { + e.longTableShardDirty[entryNum/dLongTableShardSize] = true } diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go index 0b301df439..f45a3da7da 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go @@ -6,15 +6,16 @@ package zstd import ( "fmt" - "math" - "math/bits" ) const ( - tableBits = 15 // Bits used in the table - tableSize = 1 << tableBits // Size of the table - tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. - maxMatchLength = 131074 + tableBits = 15 // Bits used in the table + tableSize = 1 << tableBits // Size of the table + tableShardCnt = 1 << (tableBits - dictShardBits) // Number of shards in the table + tableShardSize = tableSize / tableShardCnt // Size of an individual shard + tableFastHashLen = 6 + tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks. 
+ maxMatchLength = 131074 ) type tableEntry struct { @@ -24,8 +25,14 @@ type tableEntry struct { type fastEncoder struct { fastBase - table [tableSize]tableEntry - dictTable []tableEntry + table [tableSize]tableEntry +} + +type fastEncoderDict struct { + fastEncoder + dictTable []tableEntry + tableShardDirty [tableShardCnt]bool + allDirty bool } // Encode mimmics functionality in zstd_fast.c @@ -36,7 +43,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) { ) // Protect against e.cur wraparound. - for e.cur >= bufferReset { + for e.cur >= e.bufferReset-int32(len(e.hist)) { if len(e.hist) == 0 { for i := range e.table[:] { e.table[i] = tableEntry{} @@ -78,7 +85,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) { // TEMPLATE const hashLog = tableBits // seems global, but would be nice to tweak. - const kSearchStrength = 8 + const kSearchStrength = 6 // nextEmit is where in src the next emitLiteral should start from. nextEmit := s @@ -95,7 +102,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) { blk.literals = append(blk.literals, src[nextEmit:until]...) s.litLen = uint32(until - nextEmit) } - if debug { + if debugEncoder { println("recent offsets:", blk.recentOffsets) } @@ -114,8 +121,8 @@ encodeLoop: panic("offset0 was 0") } - nextHash := hash6(cv, hashLog) - nextHash2 := hash6(cv>>8, hashLog) + nextHash := hashLen(cv, hashLog, tableFastHashLen) + nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen) candidate := e.table[nextHash] candidate2 := e.table[nextHash2] repIndex := s - offset1 + 2 @@ -126,21 +133,7 @@ encodeLoop: if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) { // Consider history as well. var seq seq - var length int32 - // length = 4 + e.matchlen(s+6, repIndex+4, src) - { - a := src[s+6:] - b := src[repIndex+4:] - endI := len(a) & (math.MaxInt32 - 7) - length = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } - + length := 4 + e.matchlen(s+6, repIndex+4, src) seq.matchLen = uint32(length - zstdMinMatch) // We might be able to match backwards. @@ -170,7 +163,7 @@ encodeLoop: s += length + 2 nextEmit = s if s >= sLimit { - if debug { + if debugEncoder { println("repeat ended", s, length) } @@ -227,20 +220,7 @@ encodeLoop: } // Extend the 4-byte match as long as possible. - //l := e.matchlen(s+4, t+4, src) + 4 - var l int32 - { - a := src[s+4:] - b := src[t+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := e.matchlen(s+4, t+4, src) + 4 // Extend backwards tMin := s - e.maxMatchOff @@ -277,23 +257,10 @@ encodeLoop: if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match - //l := 4 + e.matchlen(s+4, o2+4, src) - var l int32 - { - a := src[s+4:] - b := src[o2+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := 4 + e.matchlen(s+4, o2+4, src) // Store this, since we have it. 
- nextHash := hash6(cv, hashLog) + nextHash := hashLen(cv, hashLog, tableFastHashLen) e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)} seq.matchLen = uint32(l) - zstdMinMatch seq.litLen = 0 @@ -322,7 +289,7 @@ encodeLoop: } blk.recentOffsets[0] = uint32(offset1) blk.recentOffsets[1] = uint32(offset2) - if debug { + if debugEncoder { println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) } } @@ -335,14 +302,14 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { inputMargin = 8 minNonLiteralBlockSize = 1 + 1 + inputMargin ) - if debug { - if len(src) > maxBlockSize { + if debugEncoder { + if len(src) > maxCompressedBlockSize { panic("src too big") } } // Protect against e.cur wraparound. - if e.cur >= bufferReset { + if e.cur >= e.bufferReset { for i := range e.table[:] { e.table[i] = tableEntry{} } @@ -366,7 +333,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { // TEMPLATE const hashLog = tableBits // seems global, but would be nice to tweak. - const kSearchStrength = 8 + const kSearchStrength = 6 // nextEmit is where in src the next emitLiteral should start from. nextEmit := s @@ -383,7 +350,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { blk.literals = append(blk.literals, src[nextEmit:until]...) s.litLen = uint32(until - nextEmit) } - if debug { + if debugEncoder { println("recent offsets:", blk.recentOffsets) } @@ -397,8 +364,8 @@ encodeLoop: // By not using them for the first 3 matches for { - nextHash := hash6(cv, hashLog) - nextHash2 := hash6(cv>>8, hashLog) + nextHash := hashLen(cv, hashLog, tableFastHashLen) + nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen) candidate := e.table[nextHash] candidate2 := e.table[nextHash2] repIndex := s - offset1 + 2 @@ -409,21 +376,7 @@ encodeLoop: if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) { // Consider history as well. var seq seq - // length := 4 + e.matchlen(s+6, repIndex+4, src) - // length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:])) - var length int32 - { - a := src[s+6:] - b := src[repIndex+4:] - endI := len(a) & (math.MaxInt32 - 7) - length = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + length := 4 + e.matchlen(s+6, repIndex+4, src) seq.matchLen = uint32(length - zstdMinMatch) @@ -454,7 +407,7 @@ encodeLoop: s += length + 2 nextEmit = s if s >= sLimit { - if debug { + if debugEncoder { println("repeat ended", s, length) } @@ -513,21 +466,7 @@ encodeLoop: panic(fmt.Sprintf("t (%d) < 0 ", t)) } // Extend the 4-byte match as long as possible. - //l := e.matchlenNoHist(s+4, t+4, src) + 4 - // l := int32(matchLen(src[s+4:], src[t+4:])) + 4 - var l int32 - { - a := src[s+4:] - b := src[t+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := e.matchlen(s+4, t+4, src) + 4 // Extend backwards tMin := s - e.maxMatchOff @@ -564,24 +503,10 @@ encodeLoop: if o2 := s - offset2; len(blk.sequences) > 2 && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. 
We come straight from a match - //l := 4 + e.matchlenNoHist(s+4, o2+4, src) - // l := 4 + int32(matchLen(src[s+4:], src[o2+4:])) - var l int32 - { - a := src[s+4:] - b := src[o2+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := 4 + e.matchlen(s+4, o2+4, src) // Store this, since we have it. - nextHash := hash6(cv, hashLog) + nextHash := hashLen(cv, hashLog, tableFastHashLen) e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)} seq.matchLen = uint32(l) - zstdMinMatch seq.litLen = 0 @@ -608,17 +533,290 @@ encodeLoop: blk.literals = append(blk.literals, src[nextEmit:]...) blk.extraLits = len(src) - int(nextEmit) } - if debug { + if debugEncoder { println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) } // We do not store history, so we must offset e.cur to avoid false matches for next user. - if e.cur < bufferReset { + if e.cur < e.bufferReset { e.cur += int32(len(src)) } } +// Encode will encode the content, with a dictionary if initialized for it. +func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) { + const ( + inputMargin = 8 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + if e.allDirty || len(src) > 32<<10 { + e.fastEncoder.Encode(blk, src) + e.allDirty = true + return + } + // Protect against e.cur wraparound. + for e.cur >= e.bufferReset-int32(len(e.hist)) { + if len(e.hist) == 0 { + e.table = [tableSize]tableEntry{} + e.cur = e.maxMatchOff + break + } + // Shift down everything in the table that isn't already too far away. + minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff + for i := range e.table[:] { + v := e.table[i].offset + if v < minOff { + v = 0 + } else { + v = v - e.cur + e.maxMatchOff + } + e.table[i].offset = v + } + e.cur = e.maxMatchOff + break + } + + s := e.addBlock(src) + blk.size = len(src) + if len(src) < minNonLiteralBlockSize { + blk.extraLits = len(src) + blk.literals = blk.literals[:len(src)] + copy(blk.literals, src) + return + } + + // Override src + src = e.hist + sLimit := int32(len(src)) - inputMargin + // stepSize is the number of bytes to skip on every main loop iteration. + // It should be >= 2. + const stepSize = 2 + + // TEMPLATE + const hashLog = tableBits + // seems global, but would be nice to tweak. + const kSearchStrength = 7 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := s + cv := load6432(src, s) + + // Relative offsets + offset1 := int32(blk.recentOffsets[0]) + offset2 := int32(blk.recentOffsets[1]) + + addLiterals := func(s *seq, until int32) { + if until == nextEmit { + return + } + blk.literals = append(blk.literals, src[nextEmit:until]...) + s.litLen = uint32(until - nextEmit) + } + if debugEncoder { + println("recent offsets:", blk.recentOffsets) + } + +encodeLoop: + for { + // t will contain the match offset when we find one. + // When existing the search loop, we have already checked 4 bytes. + var t int32 + + // We will not use repeat offsets across blocks. 
+ // By not using them for the first 3 matches + canRepeat := len(blk.sequences) > 2 + + for { + if debugAsserts && canRepeat && offset1 == 0 { + panic("offset0 was 0") + } + + nextHash := hashLen(cv, hashLog, tableFastHashLen) + nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen) + candidate := e.table[nextHash] + candidate2 := e.table[nextHash2] + repIndex := s - offset1 + 2 + + e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)} + e.markShardDirty(nextHash) + e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)} + e.markShardDirty(nextHash2) + + if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) { + // Consider history as well. + var seq seq + length := 4 + e.matchlen(s+6, repIndex+4, src) + + seq.matchLen = uint32(length - zstdMinMatch) + + // We might be able to match backwards. + // Extend as long as we can. + start := s + 2 + // We end the search early, so we don't risk 0 literals + // and have to do special offset treatment. + startLimit := nextEmit + 1 + + sMin := s - e.maxMatchOff + if sMin < 0 { + sMin = 0 + } + for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch { + repIndex-- + start-- + seq.matchLen++ + } + addLiterals(&seq, start) + + // rep 0 + seq.offset = 1 + if debugSequences { + println("repeat sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + s += length + 2 + nextEmit = s + if s >= sLimit { + if debugEncoder { + println("repeat ended", s, length) + + } + break encodeLoop + } + cv = load6432(src, s) + continue + } + coffset0 := s - (candidate.offset - e.cur) + coffset1 := s - (candidate2.offset - e.cur) + 1 + if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val { + // found a regular match + t = candidate.offset - e.cur + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + break + } + + if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val { + // found a regular match + t = candidate2.offset - e.cur + s++ + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + if debugAsserts && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugAsserts && t < 0 { + panic("t<0") + } + break + } + s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1)) + if s >= sLimit { + break encodeLoop + } + cv = load6432(src, s) + } + // A 4-byte match has been found. We'll later see if more than 4 bytes. + offset2 = offset1 + offset1 = s - t + + if debugAsserts && s <= t { + panic(fmt.Sprintf("s (%d) <= t (%d)", s, t)) + } + + if debugAsserts && canRepeat && int(offset1) > len(src) { + panic("invalid offset") + } + + // Extend the 4-byte match as long as possible. + l := e.matchlen(s+4, t+4, src) + 4 + + // Extend backwards + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength { + s-- + t-- + l++ + } + + // Write our sequence. + var seq seq + seq.litLen = uint32(s - nextEmit) + seq.matchLen = uint32(l - zstdMinMatch) + if seq.litLen > 0 { + blk.literals = append(blk.literals, src[nextEmit:s]...) 
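
Several hunks above delete hand-inlined match-length loops (the load64/bits.TrailingZeros64 blocks) in favor of a shared e.matchlen. The technique is worth seeing on its own: compare eight bytes at a time, and when the XOR of two words is nonzero, its trailing-zero count divided by eight is the index of the first differing byte. A self-contained sketch of that idea (a simplified stand-in, not the package's matchlen):

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// matchLen returns the number of equal leading bytes of a and b.
// The XOR of two equal words is 0; on the first difference,
// TrailingZeros64/8 gives the first differing byte (little-endian loads).
func matchLen(a, b []byte) int {
	var n int
	for len(a) >= 8 && len(b) >= 8 {
		x := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
		if x != 0 {
			return n + bits.TrailingZeros64(x)>>3
		}
		a, b = a[8:], b[8:]
		n += 8
	}
	for i := range a {
		if i >= len(b) || a[i] != b[i] {
			break
		}
		n++
	}
	return n
}

func main() {
	fmt.Println(matchLen([]byte("zstandardzz"), []byte("zstandardqq"))) // 9
}
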
+ } + // Don't use repeat offsets + seq.offset = uint32(s-t) + 3 + s += l + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + nextEmit = s + if s >= sLimit { + break encodeLoop + } + cv = load6432(src, s) + + // Check offset 2 + if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { + // We have at least 4 byte match. + // No need to check backwards. We come straight from a match + l := 4 + e.matchlen(s+4, o2+4, src) + + // Store this, since we have it. + nextHash := hashLen(cv, hashLog, tableFastHashLen) + e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)} + e.markShardDirty(nextHash) + seq.matchLen = uint32(l) - zstdMinMatch + seq.litLen = 0 + // Since litlen is always 0, this is offset 1. + seq.offset = 1 + s += l + nextEmit = s + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + // Swap offset 1 and 2. + offset1, offset2 = offset2, offset1 + if s >= sLimit { + break encodeLoop + } + // Prepare next loop. + cv = load6432(src, s) + } + } + + if int(nextEmit) < len(src) { + blk.literals = append(blk.literals, src[nextEmit:]...) + blk.extraLits = len(src) - int(nextEmit) + } + blk.recentOffsets[0] = uint32(offset1) + blk.recentOffsets[1] = uint32(offset2) + if debugEncoder { + println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) + } +} + // ResetDict will reset and set a dictionary if not nil func (e *fastEncoder) Reset(d *dict, singleBlock bool) { + e.resetBase(d, singleBlock) + if d != nil { + panic("fastEncoder: Reset with dict") + } +} + +// ResetDict will reset and set a dictionary if not nil +func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) { e.resetBase(d, singleBlock) if d == nil { return @@ -631,13 +829,12 @@ func (e *fastEncoder) Reset(d *dict, singleBlock bool) { } if true { end := e.maxMatchOff + int32(len(d.content)) - 8 - for i := e.maxMatchOff; i < end; i += 3 { + for i := e.maxMatchOff; i < end; i += 2 { const hashLog = tableBits cv := load6432(d.content, i-e.maxMatchOff) - nextHash := hash6(cv, hashLog) // 0 -> 5 - nextHash1 := hash6(cv>>8, hashLog) // 1 -> 6 - nextHash2 := hash6(cv>>16, hashLog) // 2 -> 7 + nextHash := hashLen(cv, hashLog, tableFastHashLen) // 0 -> 6 + nextHash1 := hashLen(cv>>8, hashLog, tableFastHashLen) // 1 -> 7 e.dictTable[nextHash] = tableEntry{ val: uint32(cv), offset: i, @@ -646,16 +843,49 @@ func (e *fastEncoder) Reset(d *dict, singleBlock bool) { val: uint32(cv >> 8), offset: i + 1, } - e.dictTable[nextHash2] = tableEntry{ - val: uint32(cv >> 16), - offset: i + 2, - } } } e.lastDictID = d.id + e.allDirty = true } e.cur = e.maxMatchOff - // Reset table to initial state - copy(e.table[:], e.dictTable) + dirtyShardCnt := 0 + if !e.allDirty { + for i := range e.tableShardDirty { + if e.tableShardDirty[i] { + dirtyShardCnt++ + } + } + } + + const shardCnt = tableShardCnt + const shardSize = tableShardSize + if e.allDirty || dirtyShardCnt > shardCnt*4/6 { + //copy(e.table[:], e.dictTable) + e.table = *(*[tableSize]tableEntry)(e.dictTable) + for i := range e.tableShardDirty { + e.tableShardDirty[i] = false + } + e.allDirty = false + return + } + for i := range e.tableShardDirty { + if !e.tableShardDirty[i] { + continue + } + + //copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize]) + *(*[shardSize]tableEntry)(e.table[i*shardSize:]) = *(*[shardSize]tableEntry)(e.dictTable[i*shardSize:]) + e.tableShardDirty[i] = false + } + 
e.allDirty = false +} + +func (e *fastEncoderDict) markAllShardsDirty() { + e.allDirty = true +} + +func (e *fastEncoderDict) markShardDirty(entryNum uint32) { + e.tableShardDirty[entryNum/tableShardSize] = true } diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go index f5759211da..8f8223cd3a 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder.go @@ -6,8 +6,10 @@ package zstd import ( "crypto/rand" + "errors" "fmt" "io" + "math" rdebug "runtime/debug" "sync" @@ -33,7 +35,7 @@ type encoder interface { Block() *blockEnc CRC() *xxhash.Digest AppendCRC([]byte) []byte - WindowSize(size int) int32 + WindowSize(size int64) int32 UseBlock(*blockEnc) Reset(d *dict, singleBlock bool) } @@ -48,6 +50,8 @@ type encoderState struct { err error writeErr error nWritten int64 + nInput int64 + frameContentSize int64 headerWritten bool eofWritten bool fullFrameWritten bool @@ -96,23 +100,25 @@ func (e *Encoder) Reset(w io.Writer) { if cap(s.filling) == 0 { s.filling = make([]byte, 0, e.o.blockSize) } - if cap(s.current) == 0 { - s.current = make([]byte, 0, e.o.blockSize) - } - if cap(s.previous) == 0 { - s.previous = make([]byte, 0, e.o.blockSize) + if e.o.concurrent > 1 { + if cap(s.current) == 0 { + s.current = make([]byte, 0, e.o.blockSize) + } + if cap(s.previous) == 0 { + s.previous = make([]byte, 0, e.o.blockSize) + } + s.current = s.current[:0] + s.previous = s.previous[:0] + if s.writing == nil { + s.writing = &blockEnc{lowMem: e.o.lowMem} + s.writing.init() + } + s.writing.initNewEncode() } if s.encoder == nil { s.encoder = e.o.encoder() } - if s.writing == nil { - s.writing = &blockEnc{} - s.writing.init() - } - s.writing.initNewEncode() s.filling = s.filling[:0] - s.current = s.current[:0] - s.previous = s.previous[:0] s.encoder.Reset(e.o.dict, false) s.headerWritten = false s.eofWritten = false @@ -120,7 +126,21 @@ func (e *Encoder) Reset(w io.Writer) { s.w = w s.err = nil s.nWritten = 0 + s.nInput = 0 s.writeErr = nil + s.frameContentSize = 0 +} + +// ResetContentSize will reset and set a content size for the next stream. +// If the bytes written does not match the size given an error will be returned +// when calling Close(). +// This is removed when Reset is called. +// Sizes <= 0 results in no content size set. +func (e *Encoder) ResetContentSize(w io.Writer, size int64) { + e.Reset(w) + if size >= 0 { + e.state.frameContentSize = size + } } // Write data to the encoder. @@ -130,6 +150,9 @@ func (e *Encoder) Reset(w io.Writer) { // and write CRC if requested. func (e *Encoder) Write(p []byte) (n int, err error) { s := &e.state + if s.eofWritten { + return 0, ErrEncoderClosed + } for len(p) > 0 { if len(p)+len(s.filling) < e.o.blockSize { if e.o.crc { @@ -176,14 +199,21 @@ func (e *Encoder) nextBlock(final bool) error { } if !s.headerWritten { // If we have a single block encode, do a sync compression. 
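
ResetContentSize, added above, lets a caller declare the frame's content size up front so it can be written into the frame header; per the nInput accounting, Close then fails if the byte count does not match. A usage sketch, assuming the vendored module as updated in this PR:

package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	payload := []byte("hello, zstd")

	var buf bytes.Buffer
	enc, err := zstd.NewWriter(nil)
	if err != nil {
		panic(err)
	}
	// Declare the exact number of bytes that will be written; the size is
	// embedded in the frame header, and Close fails if it does not match.
	enc.ResetContentSize(&buf, int64(len(payload)))
	if _, err := enc.Write(payload); err != nil {
		panic(err)
	}
	if err := enc.Close(); err != nil {
		panic(err) // e.g. "frame content size N given, but M bytes was written"
	}
	fmt.Println("frame bytes:", buf.Len())
}
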
+ if final && len(s.filling) == 0 && !e.o.fullZero { + s.headerWritten = true + s.fullFrameWritten = true + s.eofWritten = true + return nil + } if final && len(s.filling) > 0 { - s.current = e.EncodeAll(s.filling, s.current[:0]) + s.current = e.encodeAll(s.encoder, s.filling, s.current[:0]) var n2 int n2, s.err = s.w.Write(s.current) if s.err != nil { return s.err } s.nWritten += int64(n2) + s.nInput += int64(len(s.filling)) s.current = s.current[:0] s.filling = s.filling[:0] s.headerWritten = true @@ -194,17 +224,14 @@ func (e *Encoder) nextBlock(final bool) error { var tmp [maxHeaderSize]byte fh := frameHeader{ - ContentSize: 0, - WindowSize: uint32(s.encoder.WindowSize(0)), + ContentSize: uint64(s.frameContentSize), + WindowSize: uint32(s.encoder.WindowSize(s.frameContentSize)), SingleSegment: false, Checksum: e.o.crc, DictID: e.o.dict.ID(), } - dst, err := fh.appendTo(tmp[:0]) - if err != nil { - return err - } + dst := fh.appendTo(tmp[:0]) s.headerWritten = true s.wWg.Wait() var n2 int @@ -235,11 +262,41 @@ func (e *Encoder) nextBlock(final bool) error { return s.err } + // SYNC: + if e.o.concurrent == 1 { + src := s.filling + s.nInput += int64(len(s.filling)) + if debugEncoder { + println("Adding sync block,", len(src), "bytes, final:", final) + } + enc := s.encoder + blk := enc.Block() + blk.reset(nil) + enc.Encode(blk, src) + blk.last = final + if final { + s.eofWritten = true + } + + s.err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy) + if s.err != nil { + return s.err + } + _, s.err = s.w.Write(blk.output) + s.nWritten += int64(len(blk.output)) + s.filling = s.filling[:0] + return s.err + } + // Move blocks forward. s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current + s.nInput += int64(len(s.current)) s.wg.Add(1) + if final { + s.eofWritten = true + } go func(src []byte) { - if debug { + if debugEncoder { println("Adding block,", len(src), "bytes, final:", final) } defer func() { @@ -253,9 +310,6 @@ func (e *Encoder) nextBlock(final bool) error { blk := enc.Block() enc.Encode(blk, src) blk.last = final - if final { - s.eofWritten = true - } // Wait for pending writes. s.wWg.Wait() if s.writeErr != nil { @@ -276,22 +330,8 @@ func (e *Encoder) nextBlock(final bool) error { } s.wWg.Done() }() - err := errIncompressible - // If we got the exact same number of literals as input, - // assume the literals cannot be compressed. - if len(src) != len(blk.literals) || len(src) != e.o.blockSize { - err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy) - } - switch err { - case errIncompressible: - if debug { - println("Storing incompressible block as raw") - } - blk.encodeRaw(src) - // In fast mode, we do not transfer offsets, so we don't have to deal with changing the. - case nil: - default: - s.writeErr = err + s.writeErr = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy) + if s.writeErr != nil { return } _, s.writeErr = s.w.Write(blk.output) @@ -307,7 +347,7 @@ func (e *Encoder) nextBlock(final bool) error { // // The Copy function uses ReaderFrom if available. 
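
The new concurrent == 1 branch in nextBlock encodes blocks synchronously on the caller's goroutine instead of shipping them to a writer goroutine, which is why Reset above only allocates s.current, s.previous, and s.writing when concurrency is greater than one. A usage sketch that opts into that path:

package main

import (
	"io"
	"os"
	"strings"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// Concurrency 1 selects the synchronous path added above: blocks are
	// encoded inline in nextBlock rather than on a background goroutine.
	enc, err := zstd.NewWriter(os.Stdout, zstd.WithEncoderConcurrency(1))
	if err != nil {
		panic(err)
	}
	if _, err := io.Copy(enc, strings.NewReader("stream me")); err != nil {
		panic(err)
	}
	if err := enc.Close(); err != nil {
		panic(err)
	}
}
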
func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) { - if debug { + if debugEncoder { println("Using ReadFrom") } @@ -330,20 +370,20 @@ func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) { switch err { case io.EOF: e.state.filling = e.state.filling[:len(e.state.filling)-len(src)] - if debug { + if debugEncoder { println("ReadFrom: got EOF final block:", len(e.state.filling)) } return n, nil + case nil: default: - if debug { + if debugEncoder { println("ReadFrom: got error:", err) } e.state.err = err return n, err - case nil: } if len(src) > 0 { - if debug { + if debugEncoder { println("ReadFrom: got space left in source:", len(src)) } continue @@ -365,12 +405,20 @@ func (e *Encoder) Flush() error { if len(s.filling) > 0 { err := e.nextBlock(false) if err != nil { + // Ignore Flush after Close. + if errors.Is(s.err, ErrEncoderClosed) { + return nil + } return err } } s.wg.Wait() s.wWg.Wait() if s.err != nil { + // Ignore Flush after Close. + if errors.Is(s.err, ErrEncoderClosed) { + return nil + } return s.err } return s.writeErr @@ -386,8 +434,16 @@ func (e *Encoder) Close() error { } err := e.nextBlock(true) if err != nil { + if errors.Is(s.err, ErrEncoderClosed) { + return nil + } return err } + if s.frameContentSize > 0 { + if s.nInput != s.frameContentSize { + return fmt.Errorf("frame content size %d given, but %d bytes was written", s.frameContentSize, s.nInput) + } + } if e.state.fullFrameWritten { return s.err } @@ -418,6 +474,11 @@ func (e *Encoder) Close() error { } _, s.err = s.w.Write(frame) } + if s.err == nil { + s.err = ErrEncoderClosed + return nil + } + return s.err } @@ -428,6 +489,15 @@ func (e *Encoder) Close() error { // Data compressed with EncodeAll can be decoded with the Decoder, // using either a stream or DecodeAll. func (e *Encoder) EncodeAll(src, dst []byte) []byte { + e.init.Do(e.initialize) + enc := <-e.encoders + defer func() { + e.encoders <- enc + }() + return e.encodeAll(enc, src, dst) +} + +func (e *Encoder) encodeAll(enc encoder, src, dst []byte) []byte { if len(src) == 0 { if e.o.fullZero { // Add frame header. @@ -439,7 +509,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { Checksum: false, DictID: 0, } - dst, _ = fh.appendTo(dst) + dst = fh.appendTo(dst) // Write raw block as last one only. var blk blockHeader @@ -450,37 +520,28 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { } return dst } - e.init.Do(e.initialize) - enc := <-e.encoders - defer func() { - // Release encoder reference to last block. - // If a non-single block is needed the encoder will reset again. - e.encoders <- enc - }() - // Use single segments when above minimum window and below 1MB. - single := len(src) < 1<<20 && len(src) > MinWindowSize + + // Use single segments when above minimum window and below window size. + single := len(src) <= e.o.windowSize && len(src) > MinWindowSize if e.o.single != nil { single = *e.o.single } fh := frameHeader{ ContentSize: uint64(len(src)), - WindowSize: uint32(enc.WindowSize(len(src))), + WindowSize: uint32(enc.WindowSize(int64(len(src)))), SingleSegment: single, Checksum: e.o.crc, DictID: e.o.dict.ID(), } // If less than 1MB, allocate a buffer up front. - if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 { + if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 && !e.o.lowMem { dst = make([]byte, 0, len(src)) } - dst, err := fh.appendTo(dst) - if err != nil { - panic(err) - } + dst = fh.appendTo(dst) // If we can do everything in one block, prefer that. 
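
EncodeAll now borrows an encoder from the pool in the public wrapper and defers the body to a private encodeAll, which lets the single-shot path inside nextBlock reuse the stream's own encoder. A round-trip usage sketch against the public API:

package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	enc, err := zstd.NewWriter(nil)
	if err != nil {
		panic(err)
	}
	defer enc.Close()
	dec, err := zstd.NewReader(nil)
	if err != nil {
		panic(err)
	}
	defer dec.Close()

	src := []byte("stateless, pooled encode")
	compressed := enc.EncodeAll(src, nil)
	plain, err := dec.DecodeAll(compressed, nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(plain))
}
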
- if len(src) <= maxCompressedBlockSize { + if len(src) <= e.o.blockSize { enc.Reset(e.o.dict, true) // Slightly faster with no history and everything in one block. if e.o.crc { @@ -496,25 +557,15 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { // If we got the exact same number of literals as input, // assume the literals cannot be compressed. - err := errIncompressible oldout := blk.output - if len(blk.literals) != len(src) || len(src) != e.o.blockSize { - // Output directly to dst - blk.output = dst - err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy) - } + // Output directly to dst + blk.output = dst - switch err { - case errIncompressible: - if debug { - println("Storing incompressible block as raw") - } - dst = blk.encodeRawTo(dst, src) - case nil: - dst = blk.output - default: + err := blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy) + if err != nil { panic(err) } + dst = blk.output blk.output = oldout } else { enc.Reset(e.o.dict, false) @@ -533,25 +584,11 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { if len(src) == 0 { blk.last = true } - err := errIncompressible - // If we got the exact same number of literals as input, - // assume the literals cannot be compressed. - if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize { - err = blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy) - } - - switch err { - case errIncompressible: - if debug { - println("Storing incompressible block as raw") - } - dst = blk.encodeRawTo(dst, todo) - blk.popOffsets() - case nil: - dst = append(dst, blk.output...) - default: + err := blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy) + if err != nil { panic(err) } + dst = append(dst, blk.output...) blk.reset(nil) } } @@ -561,6 +598,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { // Add padding with content from crypto/rand.Reader if e.o.pad > 0 { add := calcSkippableFrame(int64(len(dst)), int64(e.o.pad)) + var err error dst, err = skippableFrame(dst, add, rand.Reader) if err != nil { panic(err) @@ -568,3 +606,37 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { } return dst } + +// MaxEncodedSize returns the expected maximum +// size of an encoded block or stream. +func (e *Encoder) MaxEncodedSize(size int) int { + frameHeader := 4 + 2 // magic + frame header & window descriptor + if e.o.dict != nil { + frameHeader += 4 + } + // Frame content size: + if size < 256 { + frameHeader++ + } else if size < 65536+256 { + frameHeader += 2 + } else if size < math.MaxInt32 { + frameHeader += 4 + } else { + frameHeader += 8 + } + // Final crc + if e.o.crc { + frameHeader += 4 + } + + // Max overhead is 3 bytes/block. + // There cannot be 0 blocks. + blocks := (size + e.o.blockSize) / e.o.blockSize + + // Combine, add padding. 
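
The arithmetic that MaxEncodedSize finishes just below bounds the output at a frame header, three bytes of block overhead per block, the raw payload, and optional padding. A usage sketch that sizes the destination buffer once so EncodeAll never reallocates:

package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	enc, err := zstd.NewWriter(nil)
	if err != nil {
		panic(err)
	}
	defer enc.Close()

	src := make([]byte, 1<<20)
	// Worst case: frame header + 3 bytes/block + the raw payload.
	dst := make([]byte, 0, enc.MaxEncodedSize(len(src)))
	dst = enc.EncodeAll(src, dst)
	fmt.Println(len(dst), "fits in", cap(dst))
}
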
+ maxSz := frameHeader + 3*blocks + size + if e.o.pad > 1 { + maxSz += calcSkippableFrame(int64(maxSz), int64(e.o.pad)) + } + return maxSz +} diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go index a7312f42af..20671dcb91 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go @@ -3,6 +3,8 @@ package zstd import ( "errors" "fmt" + "math" + "math/bits" "runtime" "strings" ) @@ -24,33 +26,45 @@ type encoderOptions struct { allLitEntropy bool customWindow bool customALEntropy bool + customBlockSize bool + lowMem bool dict *dict } func (o *encoderOptions) setDefault() { *o = encoderOptions{ - // use less ram: true for now, but may change. concurrent: runtime.GOMAXPROCS(0), crc: true, single: nil, - blockSize: 1 << 16, + blockSize: maxCompressedBlockSize, windowSize: 8 << 20, level: SpeedDefault, - allLitEntropy: true, + allLitEntropy: false, + lowMem: false, } } // encoder returns an encoder with the selected options. func (o encoderOptions) encoder() encoder { switch o.level { + case SpeedFastest: + if o.dict != nil { + return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}} + } + return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}} + case SpeedDefault: - return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}} + if o.dict != nil { + return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}} + } + return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}} case SpeedBetterCompression: - return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}} + if o.dict != nil { + return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}} + } + return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}} case SpeedBestCompression: - return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}} - case SpeedFastest: - return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}} + return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}} } panic("unknown compression level") } @@ -62,8 +76,9 @@ func WithEncoderCRC(b bool) EOption { } // WithEncoderConcurrency will set the concurrency, -// meaning the maximum number of decoders to run concurrently. +// meaning the maximum number of encoders to run concurrently. // The value supplied must be at least 1. +// For streams, setting a value of 1 will disable async compression. // By default this will be set to GOMAXPROCS. 
func WithEncoderConcurrency(n int) EOption { return func(o *encoderOptions) error { @@ -79,7 +94,7 @@ func WithEncoderConcurrency(n int) EOption { // The value must be a power of two between MinWindowSize and MaxWindowSize. // A larger value will enable better compression but allocate more memory and, // for above-default values, take considerably longer. -// The default value is determined by the compression level. +// The default value is determined by the compression level and max 8MB. func WithWindowSize(n int) EOption { return func(o *encoderOptions) error { switch { @@ -95,6 +110,7 @@ func WithWindowSize(n int) EOption { o.customWindow = true if o.blockSize > o.windowSize { o.blockSize = o.windowSize + o.customBlockSize = true } return nil } @@ -113,7 +129,7 @@ func WithEncoderPadding(n int) EOption { } // No need to waste our time. if n == 1 { - o.pad = 0 + n = 0 } if n > 1<<30 { return fmt.Errorf("padding must less than 1GB (1<<30 bytes) ") @@ -177,10 +193,9 @@ func EncoderLevelFromZstd(level int) EncoderLevel { return SpeedDefault case level >= 6 && level < 10: return SpeedBetterCompression - case level >= 10: - return SpeedBetterCompression + default: + return SpeedBestCompression } - return SpeedDefault } // String provides a string representation of the compression level. @@ -211,16 +226,19 @@ func WithEncoderLevel(l EncoderLevel) EOption { switch o.level { case SpeedFastest: o.windowSize = 4 << 20 + if !o.customBlockSize { + o.blockSize = 1 << 16 + } case SpeedDefault: o.windowSize = 8 << 20 case SpeedBetterCompression: - o.windowSize = 16 << 20 + o.windowSize = 8 << 20 case SpeedBestCompression: - o.windowSize = 32 << 20 + o.windowSize = 8 << 20 } } if !o.customALEntropy { - o.allLitEntropy = l > SpeedFastest + o.allLitEntropy = l > SpeedDefault } return nil @@ -267,7 +285,7 @@ func WithNoEntropyCompression(b bool) EOption { // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range. // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB. // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations. -// If this is not specified, block encodes will automatically choose this based on the input size. +// If this is not specified, block encodes will automatically choose this based on the input size and the window size. // This setting has no effect on streamed encodes. func WithSingleSegment(b bool) EOption { return func(o *encoderOptions) error { @@ -276,8 +294,25 @@ func WithSingleSegment(b bool) EOption { } } +// WithLowerEncoderMem will trade in some memory cases trade less memory usage for +// slower encoding speed. +// This will not change the window size which is the primary function for reducing +// memory usage. See WithWindowSize. +func WithLowerEncoderMem(b bool) EOption { + return func(o *encoderOptions) error { + o.lowMem = b + return nil + } +} + // WithEncoderDict allows to register a dictionary that will be used for the encode. +// +// The slice dict must be in the [dictionary format] produced by +// "zstd --train" from the Zstandard reference implementation. +// // The encoder *may* choose to use no dictionary instead for certain payloads. 
+// +// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format func WithEncoderDict(dict []byte) EOption { return func(o *encoderOptions) error { d, err := loadDict(dict) @@ -288,3 +323,17 @@ func WithEncoderDict(dict []byte) EOption { return nil } } + +// WithEncoderDictRaw registers a dictionary that may be used by the encoder. +// +// The slice content may contain arbitrary data. It will be used as an initial +// history. +func WithEncoderDictRaw(id uint32, content []byte) EOption { + return func(o *encoderOptions) error { + if bits.UintSize > 32 && uint(len(content)) > dictMaxLength { + return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content)) + } + o.dict = &dict{id: id, content: content, offsets: [3]int{1, 4, 8}} + return nil + } +} diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go index fc4a566d39..e47af66e7c 100644 --- a/vendor/github.com/klauspost/compress/zstd/framedec.go +++ b/vendor/github.com/klauspost/compress/zstd/framedec.go @@ -5,30 +5,20 @@ package zstd import ( - "bytes" + "encoding/binary" "encoding/hex" "errors" - "hash" "io" - "sync" "github.com/klauspost/compress/zstd/internal/xxhash" ) type frameDec struct { - o decoderOptions - crc hash.Hash64 - offset int64 + o decoderOptions + crc *xxhash.Digest WindowSize uint64 - // maxWindowSize is the maximum windows size to support. - // should never be bigger than max-int. - maxWindowSize uint64 - - // In order queue of blocks being decoded. - decoding chan *blockDec - // Frame history passed between blocks history history @@ -38,35 +28,32 @@ type frameDec struct { bBuf byteBuf FrameContentSize uint64 - frameDone sync.WaitGroup - DictionaryID *uint32 + DictionaryID uint32 HasCheckSum bool SingleSegment bool - - // asyncRunning indicates whether the async routine processes input on 'decoding'. - asyncRunningMu sync.Mutex - asyncRunning bool } const ( - // The minimum Window_Size is 1 KB. + // MinWindowSize is the minimum Window Size, which is 1 KB. MinWindowSize = 1 << 10 + + // MaxWindowSize is the maximum encoder window size + // and the default decoder maximum window size. MaxWindowSize = 1 << 29 ) -var ( - frameMagic = []byte{0x28, 0xb5, 0x2f, 0xfd} - skippableFrameMagic = []byte{0x2a, 0x4d, 0x18} +const ( + frameMagic = "\x28\xb5\x2f\xfd" + skippableFrameMagic = "\x2a\x4d\x18" ) func newFrameDec(o decoderOptions) *frameDec { - d := frameDec{ - o: o, - maxWindowSize: MaxWindowSize, + if o.maxWindowSize > o.maxDecodedSize { + o.maxWindowSize = o.maxDecodedSize } - if d.maxWindowSize > o.maxDecodedSize { - d.maxWindowSize = o.maxDecodedSize + d := frameDec{ + o: o, } return &d } @@ -78,50 +65,74 @@ func newFrameDec(o decoderOptions) *frameDec { func (d *frameDec) reset(br byteBuffer) error { d.HasCheckSum = false d.WindowSize = 0 - var b []byte + var signature [4]byte for { - b = br.readSmall(4) - if b == nil { + var err error + // Check if we can read more... 
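
The rewritten frameDec.reset probes the first signature byte on its own so a clean end of input surfaces as io.EOF, then matches skippable frames by pattern: any leading byte 0x50 through 0x5F followed by the three-byte skippable magic. A standalone sketch of that signature test (the helper name is illustrative):

package main

import "fmt"

const (
	frameMagic          = "\x28\xb5\x2f\xfd"
	skippableFrameMagic = "\x2a\x4d\x18"
)

// isSkippable reports whether a 4-byte signature starts a skippable frame:
// the first byte is 0x50-0x5F and the remaining three bytes are the
// skippable magic, matching the test in frameDec.reset above.
func isSkippable(sig [4]byte) bool {
	return sig[0]&0xf0 == 0x50 && string(sig[1:4]) == skippableFrameMagic
}

func main() {
	fmt.Println(isSkippable([4]byte{0x5a, 0x2a, 0x4d, 0x18})) // true
	var zstdSig [4]byte
	copy(zstdSig[:], frameMagic)
	fmt.Println(isSkippable(zstdSig)) // false: a regular zstd frame
}
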
+ b, err := br.readSmall(1) + switch err { + case io.EOF, io.ErrUnexpectedEOF: + return io.EOF + case nil: + signature[0] = b[0] + default: + return err + } + // Read the rest, don't allow io.ErrUnexpectedEOF + b, err = br.readSmall(3) + switch err { + case io.EOF: return io.EOF + case nil: + copy(signature[1:], b) + default: + return err } - if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 { - if debug { - println("Not skippable", hex.EncodeToString(b), hex.EncodeToString(skippableFrameMagic)) + + if string(signature[1:4]) != skippableFrameMagic || signature[0]&0xf0 != 0x50 { + if debugDecoder { + println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString([]byte(skippableFrameMagic))) } // Break if not skippable frame. break } // Read size to skip - b = br.readSmall(4) - if b == nil { - println("Reading Frame Size EOF") - return io.ErrUnexpectedEOF + b, err = br.readSmall(4) + if err != nil { + if debugDecoder { + println("Reading Frame Size", err) + } + return err } n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24) println("Skipping frame with", n, "bytes.") - err := br.skipN(int(n)) + err = br.skipN(int64(n)) if err != nil { - if debug { + if debugDecoder { println("Reading discarded frame", err) } return err } } - if !bytes.Equal(b, frameMagic) { - println("Got magic numbers: ", b, "want:", frameMagic) + if string(signature[:]) != frameMagic { + if debugDecoder { + println("Got magic numbers: ", signature, "want:", []byte(frameMagic)) + } return ErrMagicMismatch } // Read Frame_Header_Descriptor fhd, err := br.readByte() if err != nil { - println("Reading Frame_Header_Descriptor", err) + if debugDecoder { + println("Reading Frame_Header_Descriptor", err) + } return err } d.SingleSegment = fhd&(1<<5) != 0 if fhd&(1<<3) != 0 { - return errors.New("Reserved bit set on frame header") + return errors.New("reserved bit set on frame header") } // Read Window_Descriptor @@ -130,10 +141,14 @@ func (d *frameDec) reset(br byteBuffer) error { if !d.SingleSegment { wd, err := br.readByte() if err != nil { - println("Reading Window_Descriptor", err) + if debugDecoder { + println("Reading Window_Descriptor", err) + } return err } - printf("raw: %x, mantissa: %d, exponent: %d\n", wd, wd&7, wd>>3) + if debugDecoder { + printf("raw: %x, mantissa: %d, exponent: %d\n", wd, wd&7, wd>>3) + } windowLog := 10 + (wd >> 3) windowBase := uint64(1) << windowLog windowAdd := (windowBase / 8) * uint64(wd&0x7) @@ -142,20 +157,19 @@ func (d *frameDec) reset(br byteBuffer) error { // Read Dictionary_ID // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id - d.DictionaryID = nil + d.DictionaryID = 0 if size := fhd & 3; size != 0 { if size == 3 { size = 4 } - b = br.readSmall(int(size)) - if b == nil { - if debug { - println("Reading Dictionary_ID", io.ErrUnexpectedEOF) - } - return io.ErrUnexpectedEOF + + b, err := br.readSmall(int(size)) + if err != nil { + println("Reading Dictionary_ID", err) + return err } var id uint32 - switch size { + switch len(b) { case 1: id = uint32(b[0]) case 2: @@ -163,14 +177,10 @@ func (d *frameDec) reset(br byteBuffer) error { case 4: id = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24) } - if debug { + if debugDecoder { println("Dict size", size, "ID:", id) } - if id > 0 { - // ID 0 means "sorry, no dictionary anyway". 
- // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format - d.DictionaryID = &id - } + d.DictionaryID = id } // Read Frame_Content_Size @@ -185,14 +195,14 @@ func (d *frameDec) reset(br byteBuffer) error { default: fcsSize = 1 << v } - d.FrameContentSize = 0 + d.FrameContentSize = fcsUnknown if fcsSize > 0 { - b := br.readSmall(fcsSize) - if b == nil { - println("Reading Frame content", io.ErrUnexpectedEOF) - return io.ErrUnexpectedEOF + b, err := br.readSmall(fcsSize) + if err != nil { + println("Reading Frame content", err) + return err } - switch fcsSize { + switch len(b) { case 1: d.FrameContentSize = uint64(b[0]) case 2: @@ -205,10 +215,11 @@ func (d *frameDec) reset(br byteBuffer) error { d2 := uint32(b[4]) | (uint32(b[5]) << 8) | (uint32(b[6]) << 16) | (uint32(b[7]) << 24) d.FrameContentSize = uint64(d1) | (uint64(d2) << 32) } - if debug { - println("field size bits:", v, "fcsSize:", fcsSize, "FrameContentSize:", d.FrameContentSize, hex.EncodeToString(b[:fcsSize]), "singleseg:", d.SingleSegment, "window:", d.WindowSize) + if debugDecoder { + println("Read FCS:", d.FrameContentSize) } } + // Move this to shared. d.HasCheckSum = fhd&(1<<2) != 0 if d.HasCheckSum { @@ -218,29 +229,52 @@ func (d *frameDec) reset(br byteBuffer) error { d.crc.Reset() } + if d.WindowSize > d.o.maxWindowSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrWindowSizeExceeded + } + if d.WindowSize == 0 && d.SingleSegment { // We may not need window in this case. d.WindowSize = d.FrameContentSize if d.WindowSize < MinWindowSize { d.WindowSize = MinWindowSize } + if d.WindowSize > d.o.maxDecodedSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrDecoderSizeExceeded + } } - if d.WindowSize > d.maxWindowSize { - printf("window size %d > max %d\n", d.WindowSize, d.maxWindowSize) - return ErrWindowSizeExceeded - } // The minimum Window_Size is 1 KB. if d.WindowSize < MinWindowSize { - println("got window size: ", d.WindowSize) + if debugDecoder { + println("got window size: ", d.WindowSize) + } return ErrWindowSizeTooSmall } d.history.windowSize = int(d.WindowSize) - if d.o.lowMem && d.history.windowSize < maxBlockSize { - d.history.maxSize = d.history.windowSize * 2 + if !d.o.lowMem || d.history.windowSize < maxBlockSize { + // Alloc 2x window size if not low-mem, or window size below 2MB. + d.history.allocFrameBuffer = d.history.windowSize * 2 } else { - d.history.maxSize = d.history.windowSize + maxBlockSize + if d.o.lowMem { + // Alloc with 1MB extra. + d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize/2 + } else { + // Alloc with 2MB extra. + d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize + } } + + if debugDecoder { + println("Frame: Dict:", d.DictionaryID, "FrameContentSize:", d.FrameContentSize, "singleseg:", d.SingleSegment, "window:", d.WindowSize, "crc:", d.HasCheckSum) + } + // history contains input - maybe we do something d.rawInput = br return nil @@ -248,244 +282,131 @@ func (d *frameDec) reset(br byteBuffer) error { // next will start decoding the next block from stream. func (d *frameDec) next(block *blockDec) error { - if debug { - printf("decoding new block %p:%p", block, block.data) + if debugDecoder { + println("decoding new block") } err := block.reset(d.rawInput, d.WindowSize) if err != nil { println("block error:", err) // Signal the frame decoder we have a problem. 
- d.sendErr(block, err) + block.sendErr(err) return err } - block.input <- struct{}{} - if debug { - println("next block:", block) - } - d.asyncRunningMu.Lock() - defer d.asyncRunningMu.Unlock() - if !d.asyncRunning { - return nil - } - if block.Last { - // We indicate the frame is done by sending io.EOF - d.decoding <- block - return io.EOF - } - d.decoding <- block return nil } -// sendEOF will queue an error block on the frame. -// This will cause the frame decoder to return when it encounters the block. -// Returns true if the decoder was added. -func (d *frameDec) sendErr(block *blockDec, err error) bool { - d.asyncRunningMu.Lock() - defer d.asyncRunningMu.Unlock() - if !d.asyncRunning { - return false - } - - println("sending error", err.Error()) - block.sendErr(err) - d.decoding <- block - return true -} - -// checkCRC will check the checksum if the frame has one. +// checkCRC will check the checksum, assuming the frame has one. // Will return ErrCRCMismatch if crc check failed, otherwise nil. func (d *frameDec) checkCRC() error { - if !d.HasCheckSum { - return nil - } - var tmp [4]byte - got := d.crc.Sum64() - // Flip to match file order. - tmp[0] = byte(got >> 0) - tmp[1] = byte(got >> 8) - tmp[2] = byte(got >> 16) - tmp[3] = byte(got >> 24) - // We can overwrite upper tmp now - want := d.rawInput.readSmall(4) - if want == nil { - println("CRC missing?") - return io.ErrUnexpectedEOF + buf, err := d.rawInput.readSmall(4) + if err != nil { + println("CRC missing?", err) + return err } - if !bytes.Equal(tmp[:], want) { - if debug { - println("CRC Check Failed:", tmp[:], "!=", want) + want := binary.LittleEndian.Uint32(buf[:4]) + got := uint32(d.crc.Sum64()) + + if got != want { + if debugDecoder { + printf("CRC check failed: got %08x, want %08x\n", got, want) } return ErrCRCMismatch } - if debug { - println("CRC ok", tmp[:]) + if debugDecoder { + printf("CRC ok %08x\n", got) } return nil } -func (d *frameDec) initAsync() { - if !d.o.lowMem && !d.SingleSegment { - // set max extra size history to 10MB. - d.history.maxSize = d.history.windowSize + maxBlockSize*5 - } - // re-alloc if more than one extra block size. - if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize { - d.history.b = make([]byte, 0, d.history.maxSize) - } - if cap(d.history.b) < d.history.maxSize { - d.history.b = make([]byte, 0, d.history.maxSize) - } - if cap(d.decoding) < d.o.concurrent { - d.decoding = make(chan *blockDec, d.o.concurrent) - } - if debug { - h := d.history - printf("history init. len: %d, cap: %d", len(h.b), cap(h.b)) - } - d.asyncRunningMu.Lock() - d.asyncRunning = true - d.asyncRunningMu.Unlock() -} - -// startDecoder will start decoding blocks and write them to the writer. -// The decoder will stop as soon as an error occurs or at end of frame. -// When the frame has finished decoding the *bufio.Reader -// containing the remaining input will be sent on frameDec.frameDone. -func (d *frameDec) startDecoder(output chan decodeOutput) { - written := int64(0) - - defer func() { - d.asyncRunningMu.Lock() - d.asyncRunning = false - d.asyncRunningMu.Unlock() - - // Drain the currently decoding. - d.history.error = true - flushdone: - for { - select { - case b := <-d.decoding: - b.history <- &d.history - output <- <-b.result - default: - break flushdone - } - } - println("frame decoder done, signalling done") - d.frameDone.Done() - }() - // Get decoder for first block. 
- block := <-d.decoding - block.history <- &d.history - for { - var next *blockDec - // Get result - r := <-block.result - if r.err != nil { - println("Result contained error", r.err) - output <- r - return - } - if debug { - println("got result, from ", d.offset, "to", d.offset+int64(len(r.b))) - d.offset += int64(len(r.b)) - } - if !block.Last { - // Send history to next block - select { - case next = <-d.decoding: - if debug { - println("Sending ", len(d.history.b), "bytes as history") - } - next.history <- &d.history - default: - // Wait until we have sent the block, so - // other decoders can potentially get the decoder. - next = nil - } - } - - // Add checksum, async to decoding. - if d.HasCheckSum { - n, err := d.crc.Write(r.b) - if err != nil { - r.err = err - if n != len(r.b) { - r.err = io.ErrShortWrite - } - output <- r - return - } - } - written += int64(len(r.b)) - if d.SingleSegment && uint64(written) > d.FrameContentSize { - println("runDecoder: single segment and", uint64(written), ">", d.FrameContentSize) - r.err = ErrFrameSizeExceeded - output <- r - return - } - if block.Last { - r.err = d.checkCRC() - output <- r - return - } - output <- r - if next == nil { - // There was no decoder available, we wait for one now that we have sent to the writer. - if debug { - println("Sending ", len(d.history.b), " bytes as history") - } - next = <-d.decoding - next.history <- &d.history - } - block = next +// consumeCRC skips over the checksum, assuming the frame has one. +func (d *frameDec) consumeCRC() error { + _, err := d.rawInput.readSmall(4) + if err != nil { + println("CRC missing?", err) } + return err } -// runDecoder will create a sync decoder that will decode a block of data. +// runDecoder will run the decoder for the remainder of the frame. func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) { saved := d.history.b // We use the history for output to avoid copying it. d.history.b = dst + d.history.ignoreBuffer = len(dst) // Store input length, so we only check new data. 
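// Editorial aside, not part of the patch: the limitToCap and maxDecodedSize
// checks below are the internals behind the decoder options. A usage sketch,
// assuming the public options behave as documented in the package:

package example

import "github.com/klauspost/compress/zstd"

func decodeBounded(compressed []byte) ([]byte, error) {
	dec, err := zstd.NewReader(nil, // nil reader is fine for DecodeAll-only use
		zstd.WithDecoderMaxMemory(64<<20), // hard cap on decoded size
		zstd.WithDecodeAllCapLimit(true),  // stop at cap(dst) instead of growing
	)
	if err != nil {
		return nil, err
	}
	defer dec.Close()

	dst := make([]byte, 0, 1<<20) // decode at most 1 MiB into dst
	return dec.DecodeAll(compressed, dst)
}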
	crcStart := len(dst)
+	d.history.decoders.maxSyncLen = 0
+	if d.o.limitToCap {
+		d.history.decoders.maxSyncLen = uint64(cap(dst) - len(dst))
+	}
+	if d.FrameContentSize != fcsUnknown {
+		if !d.o.limitToCap || d.FrameContentSize+uint64(len(dst)) < d.history.decoders.maxSyncLen {
+			d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+		}
+		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
+			if debugDecoder {
+				println("maxSyncLen:", d.history.decoders.maxSyncLen, "> maxDecodedSize:", d.o.maxDecodedSize)
+			}
+			return dst, ErrDecoderSizeExceeded
+		}
+		if debugDecoder {
+			println("maxSyncLen:", d.history.decoders.maxSyncLen)
+		}
+		if !d.o.limitToCap && uint64(cap(dst)) < d.history.decoders.maxSyncLen {
+			// Alloc for output
+			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
+			copy(dst2, dst)
+			dst = dst2
+		}
+	}
 	var err error
 	for {
 		err = dec.reset(d.rawInput, d.WindowSize)
 		if err != nil {
 			break
 		}
-		if debug {
+		if debugDecoder {
 			println("next block:", dec)
 		}
 		err = dec.decodeBuf(&d.history)
-		if err != nil || dec.Last {
+		if err != nil {
+			break
+		}
+		if uint64(len(d.history.b)-crcStart) > d.o.maxDecodedSize {
+			println("runDecoder: maxDecodedSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.o.maxDecodedSize)
+			err = ErrDecoderSizeExceeded
 			break
 		}
-		if uint64(len(d.history.b)) > d.o.maxDecodedSize {
+		if d.o.limitToCap && len(d.history.b) > cap(dst) {
+			println("runDecoder: cap exceeded", uint64(len(d.history.b)), ">", cap(dst))
 			err = ErrDecoderSizeExceeded
 			break
 		}
-		if d.SingleSegment && uint64(len(d.history.b)) > d.o.maxDecodedSize {
-			println("runDecoder: single segment and", uint64(len(d.history.b)), ">", d.o.maxDecodedSize)
+		if uint64(len(d.history.b)-crcStart) > d.FrameContentSize {
+			println("runDecoder: FrameContentSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.FrameContentSize)
 			err = ErrFrameSizeExceeded
 			break
 		}
+		if dec.Last {
+			break
+		}
+		if debugDecoder {
+			println("runDecoder: FrameContentSize", uint64(len(d.history.b)-crcStart), "<=", d.FrameContentSize)
+		}
 	}
 	dst = d.history.b
 	if err == nil {
-		if d.HasCheckSum {
-			var n int
-			n, err = d.crc.Write(dst[crcStart:])
-			if err == nil {
-				if n != len(dst)-crcStart {
-					err = io.ErrShortWrite
-				} else {
-					err = d.checkCRC()
-				}
+		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
+			err = ErrFrameSizeMismatch
+		} else if d.HasCheckSum {
+			if d.o.ignoreChecksum {
+				err = d.consumeCRC()
+			} else {
+				d.crc.Write(dst[crcStart:])
+				err = d.checkCRC()
 			}
 		}
 	}
diff --git a/vendor/github.com/klauspost/compress/zstd/frameenc.go b/vendor/github.com/klauspost/compress/zstd/frameenc.go
index 4ef7f5a3e3..667ca06794 100644
--- a/vendor/github.com/klauspost/compress/zstd/frameenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/frameenc.go
@@ -22,7 +22,7 @@ type frameHeader struct {
 
 const maxHeaderSize = 14
 
-func (f frameHeader) appendTo(dst []byte) ([]byte, error) {
+func (f frameHeader) appendTo(dst []byte) []byte {
 	dst = append(dst, frameMagic...)
 	var fhd uint8
 	if f.Checksum {
@@ -76,7 +76,7 @@ func (f frameHeader) appendTo(dst []byte) ([]byte, error) {
 	if f.SingleSegment {
 		dst = append(dst, uint8(f.ContentSize))
 	}
-	// Unless SingleSegment is set, framessizes < 256 are nto stored.
+	// Unless SingleSegment is set, frame sizes < 256 are not stored.
 	case 1:
 		f.ContentSize -= 256
 		dst = append(dst, uint8(f.ContentSize), uint8(f.ContentSize>>8))
@@ -88,7 +88,7 @@ func (f frameHeader) appendTo(dst []byte) ([]byte, error) {
 	default:
 		panic("invalid fcs")
 	}
-	return dst, nil
+	return dst
 }
 
 const skippableFrameHeader = 4 + 4
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index e6d3d49b39..2f8860a722 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -5,8 +5,10 @@
 package zstd
 
 import (
+	"encoding/binary"
 	"errors"
 	"fmt"
+	"io"
 )
 
 const (
@@ -178,10 +180,32 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 		return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
 	}
 	b.advance((bitCount + 7) >> 3)
-	// println(s.norm[:s.symbolLen], s.symbolLen)
 	return s.buildDtable()
 }
 
+func (s *fseDecoder) mustReadFrom(r io.Reader) {
+	fatalErr := func(err error) {
+		if err != nil {
+			panic(err)
+		}
+	}
+	//	dt             [maxTablesize]decSymbol // Decompression table.
+	//	symbolLen      uint16                  // Length of active part of the symbol table.
+	//	actualTableLog uint8                   // Selected tablelog.
+	//	maxBits        uint8                   // Maximum number of additional bits
+	//	// used for table creation to avoid allocations.
+	//	stateTable [256]uint16
+	//	norm       [maxSymbolValue + 1]int16
+	//	preDefined bool
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
+}
+
 // decSymbol contains information about a state entry,
 // Including the state offset base, the output symbol and
 // the number of bits to read for the low part of the destination state.
@@ -204,18 +228,10 @@ func (d decSymbol) newState() uint16 {
 	return uint16(d >> 16)
 }
 
-func (d decSymbol) baseline() uint32 {
-	return uint32(d >> 32)
-}
-
 func (d decSymbol) baselineInt() int {
 	return int(d >> 32)
 }
 
-func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
-	*d = decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
-}
-
 func (d *decSymbol) setNBits(nBits uint8) {
 	const mask = 0xffffffffffffff00
 	*d = (*d & mask) | decSymbol(nBits)
@@ -231,11 +247,6 @@ func (d *decSymbol) setNewState(state uint16) {
 	*d = (*d & mask) | decSymbol(state)<<16
 }
 
-func (d *decSymbol) setBaseline(baseline uint32) {
-	const mask = 0xffffffff
-	*d = (*d & mask) | decSymbol(baseline)<<32
-}
-
 func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
 	const mask = 0xffff00ff
 	*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
@@ -257,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) {
 	s.dt[0] = symbol
 }
 
-// buildDtable will build the decoding table.
-func (s *fseDecoder) buildDtable() error { - tableSize := uint32(1 << s.actualTableLog) - highThreshold := tableSize - 1 - symbolNext := s.stateTable[:256] - - // Init, lay down lowprob symbols - { - for i, v := range s.norm[:s.symbolLen] { - if v == -1 { - s.dt[highThreshold].setAddBits(uint8(i)) - highThreshold-- - symbolNext[i] = 1 - } else { - symbolNext[i] = uint16(v) - } - } - } - // Spread symbols - { - tableMask := tableSize - 1 - step := tableStep(tableSize) - position := uint32(0) - for ss, v := range s.norm[:s.symbolLen] { - for i := 0; i < int(v); i++ { - s.dt[position].setAddBits(uint8(ss)) - position = (position + step) & tableMask - for position > highThreshold { - // lowprob area - position = (position + step) & tableMask - } - } - } - if position != 0 { - // position must reach all cells once, otherwise normalizedCounter is incorrect - return errors.New("corrupted input (position != 0)") - } - } - - // Build Decoding table - { - tableSize := uint16(1 << s.actualTableLog) - for u, v := range s.dt[:tableSize] { - symbol := v.addBits() - nextState := symbolNext[symbol] - symbolNext[symbol] = nextState + 1 - nBits := s.actualTableLog - byte(highBits(uint32(nextState))) - s.dt[u&maxTableMask].setNBits(nBits) - newState := (nextState << nBits) - tableSize - if newState > tableSize { - return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize) - } - if newState == uint16(u) && nBits == 0 { - // Seems weird that this is possible with nbits > 0. - return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u) - } - s.dt[u&maxTableMask].setNewState(newState) - } - } - return nil -} - // transform will transform the decoder table into a table usable for // decoding without having to apply the transformation while decoding. // The state will contain the base value and the number of bits to read. @@ -352,34 +301,7 @@ func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) { s.state = dt[br.getBits(tableLog)] } -// next returns the current symbol and sets the next state. -// At least tablelog bits must be available in the bit reader. -func (s *fseState) next(br *bitReader) { - lowBits := uint16(br.getBits(s.state.nbBits())) - s.state = s.dt[s.state.newState()+lowBits] -} - -// finished returns true if all bits have been read from the bitstream -// and the next state would require reading bits from the input. -func (s *fseState) finished(br *bitReader) bool { - return br.finished() && s.state.nbBits() > 0 -} - -// final returns the current state symbol without decoding the next. -func (s *fseState) final() (int, uint8) { - return s.state.baselineInt(), s.state.addBits() -} - // final returns the current state symbol without decoding the next. func (s decSymbol) final() (int, uint8) { return s.baselineInt(), s.addBits() } - -// nextFast returns the next symbol and sets the next state. -// This can only be used if no symbols are 0 bits. -// At least tablelog bits must be available in the bit reader. 
-func (s *fseState) nextFast(br *bitReader) (uint32, uint8) { - lowBits := uint16(br.getBitsFast(s.state.nbBits())) - s.state = s.dt[s.state.newState()+lowBits] - return s.state.baseline(), s.state.addBits() -} diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go new file mode 100644 index 0000000000..d04a829b0a --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go @@ -0,0 +1,65 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +package zstd + +import ( + "fmt" +) + +type buildDtableAsmContext struct { + // inputs + stateTable *uint16 + norm *int16 + dt *uint64 + + // outputs --- set by the procedure in the case of error; + // for interpretation please see the error handling part below + errParam1 uint64 + errParam2 uint64 +} + +// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable. +// Function returns non-zero exit code on error. +// +//go:noescape +func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int + +// please keep in sync with _generate/gen_fse.go +const ( + errorCorruptedNormalizedCounter = 1 + errorNewStateTooBig = 2 + errorNewStateNoBits = 3 +) + +// buildDtable will build the decoding table. +func (s *fseDecoder) buildDtable() error { + ctx := buildDtableAsmContext{ + stateTable: &s.stateTable[0], + norm: &s.norm[0], + dt: (*uint64)(&s.dt[0]), + } + code := buildDtable_asm(s, &ctx) + + if code != 0 { + switch code { + case errorCorruptedNormalizedCounter: + position := ctx.errParam1 + return fmt.Errorf("corrupted input (position=%d, expected 0)", position) + + case errorNewStateTooBig: + newState := decSymbol(ctx.errParam1) + size := ctx.errParam2 + return fmt.Errorf("newState (%d) outside table size (%d)", newState, size) + + case errorNewStateNoBits: + newState := decSymbol(ctx.errParam1) + oldState := decSymbol(ctx.errParam2) + return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState) + + default: + return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code) + } + } + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s new file mode 100644 index 0000000000..bcde398695 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s @@ -0,0 +1,126 @@ +// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT. 
+ +//go:build !appengine && !noasm && gc && !noasm + +// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int +TEXT ·buildDtable_asm(SB), $0-24 + MOVQ ctx+8(FP), CX + MOVQ s+0(FP), DI + + // Load values + MOVBQZX 4098(DI), DX + XORQ AX, AX + BTSQ DX, AX + MOVQ (CX), BX + MOVQ 16(CX), SI + LEAQ -1(AX), R8 + MOVQ 8(CX), CX + MOVWQZX 4096(DI), DI + + // End load values + // Init, lay down lowprob symbols + XORQ R9, R9 + JMP init_main_loop_condition + +init_main_loop: + MOVWQSX (CX)(R9*2), R10 + CMPW R10, $-1 + JNE do_not_update_high_threshold + MOVB R9, 1(SI)(R8*8) + DECQ R8 + MOVQ $0x0000000000000001, R10 + +do_not_update_high_threshold: + MOVW R10, (BX)(R9*2) + INCQ R9 + +init_main_loop_condition: + CMPQ R9, DI + JL init_main_loop + + // Spread symbols + // Calculate table step + MOVQ AX, R9 + SHRQ $0x01, R9 + MOVQ AX, R10 + SHRQ $0x03, R10 + LEAQ 3(R9)(R10*1), R9 + + // Fill add bits values + LEAQ -1(AX), R10 + XORQ R11, R11 + XORQ R12, R12 + JMP spread_main_loop_condition + +spread_main_loop: + XORQ R13, R13 + MOVWQSX (CX)(R12*2), R14 + JMP spread_inner_loop_condition + +spread_inner_loop: + MOVB R12, 1(SI)(R11*8) + +adjust_position: + ADDQ R9, R11 + ANDQ R10, R11 + CMPQ R11, R8 + JG adjust_position + INCQ R13 + +spread_inner_loop_condition: + CMPQ R13, R14 + JL spread_inner_loop + INCQ R12 + +spread_main_loop_condition: + CMPQ R12, DI + JL spread_main_loop + TESTQ R11, R11 + JZ spread_check_ok + MOVQ ctx+8(FP), AX + MOVQ R11, 24(AX) + MOVQ $+1, ret+16(FP) + RET + +spread_check_ok: + // Build Decoding table + XORQ DI, DI + +build_table_main_table: + MOVBQZX 1(SI)(DI*8), CX + MOVWQZX (BX)(CX*2), R8 + LEAQ 1(R8), R9 + MOVW R9, (BX)(CX*2) + MOVQ R8, R9 + BSRQ R9, R9 + MOVQ DX, CX + SUBQ R9, CX + SHLQ CL, R8 + SUBQ AX, R8 + MOVB CL, (SI)(DI*8) + MOVW R8, 2(SI)(DI*8) + CMPQ R8, AX + JLE build_table_check1_ok + MOVQ ctx+8(FP), CX + MOVQ R8, 24(CX) + MOVQ AX, 32(CX) + MOVQ $+2, ret+16(FP) + RET + +build_table_check1_ok: + TESTB CL, CL + JNZ build_table_check2_ok + CMPW R8, DI + JNE build_table_check2_ok + MOVQ ctx+8(FP), AX + MOVQ R8, 24(AX) + MOVQ DI, 32(AX) + MOVQ $+3, ret+16(FP) + RET + +build_table_check2_ok: + INCQ DI + CMPQ DI, AX + JL build_table_main_table + MOVQ $+0, ret+16(FP) + RET diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go new file mode 100644 index 0000000000..8adfebb029 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go @@ -0,0 +1,73 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +package zstd + +import ( + "errors" + "fmt" +) + +// buildDtable will build the decoding table. 
+func (s *fseDecoder) buildDtable() error { + tableSize := uint32(1 << s.actualTableLog) + highThreshold := tableSize - 1 + symbolNext := s.stateTable[:256] + + // Init, lay down lowprob symbols + { + for i, v := range s.norm[:s.symbolLen] { + if v == -1 { + s.dt[highThreshold].setAddBits(uint8(i)) + highThreshold-- + v = 1 + } + symbolNext[i] = uint16(v) + } + } + + // Spread symbols + { + tableMask := tableSize - 1 + step := tableStep(tableSize) + position := uint32(0) + for ss, v := range s.norm[:s.symbolLen] { + for i := 0; i < int(v); i++ { + s.dt[position].setAddBits(uint8(ss)) + for { + // lowprob area + position = (position + step) & tableMask + if position <= highThreshold { + break + } + } + } + } + if position != 0 { + // position must reach all cells once, otherwise normalizedCounter is incorrect + return errors.New("corrupted input (position != 0)") + } + } + + // Build Decoding table + { + tableSize := uint16(1 << s.actualTableLog) + for u, v := range s.dt[:tableSize] { + symbol := v.addBits() + nextState := symbolNext[symbol] + symbolNext[symbol] = nextState + 1 + nBits := s.actualTableLog - byte(highBits(uint32(nextState))) + s.dt[u&maxTableMask].setNBits(nBits) + newState := (nextState << nBits) - tableSize + if newState > tableSize { + return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize) + } + if newState == uint16(u) && nBits == 0 { + // Seems weird that this is possible with nbits > 0. + return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u) + } + s.dt[u&maxTableMask].setNewState(newState) + } + } + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go index aa9eba88b8..ab26326a8f 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go @@ -62,9 +62,8 @@ func (s symbolTransform) String() string { // To indicate that you have populated the histogram call HistogramFinished // with the value of the highest populated symbol, as well as the number of entries // in the most populated entry. These are accepted at face value. -// The returned slice will always be length 256. -func (s *fseEncoder) Histogram() []uint32 { - return s.count[:] +func (s *fseEncoder) Histogram() *[256]uint32 { + return &s.count } // HistogramFinished can be called to indicate that the histogram has been populated. @@ -77,27 +76,12 @@ func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) { s.clearCount = maxCount != 0 } -// prepare will prepare and allocate scratch tables used for both compression and decompression. -func (s *fseEncoder) prepare() (*fseEncoder, error) { - if s == nil { - s = &fseEncoder{} - } - s.useRLE = false - if s.clearCount && s.maxCount == 0 { - for i := range s.count { - s.count[i] = 0 - } - s.clearCount = false - } - return s, nil -} - // allocCtable will allocate tables needed for compression. // If existing tables a re big enough, they are simply re-used. func (s *fseEncoder) allocCtable() { tableSize := 1 << s.actualTableLog // get tableSymbol that is big enough. 
- if cap(s.ct.tableSymbol) < int(tableSize) { + if cap(s.ct.tableSymbol) < tableSize { s.ct.tableSymbol = make([]byte, tableSize) } s.ct.tableSymbol = s.ct.tableSymbol[:tableSize] @@ -202,13 +186,13 @@ func (s *fseEncoder) buildCTable() error { case 0: case -1, 1: symbolTT[i].deltaNbBits = tl - symbolTT[i].deltaFindState = int16(total - 1) + symbolTT[i].deltaFindState = total - 1 total++ default: maxBitsOut := uint32(tableLog) - highBit(uint32(v-1)) minStatePlus := uint32(v) << maxBitsOut symbolTT[i].deltaNbBits = (maxBitsOut << 16) - minStatePlus - symbolTT[i].deltaFindState = int16(total - v) + symbolTT[i].deltaFindState = total - v total += v } } @@ -229,7 +213,7 @@ func (s *fseEncoder) setRLE(val byte) { deltaFindState: 0, deltaNbBits: 0, } - if debug { + if debugEncoder { println("setRLE: val", val, "symbolTT", s.ct.symbolTT[val]) } s.rleVal = val @@ -353,8 +337,8 @@ func (s *fseEncoder) normalizeCount2(length int) error { distributed uint32 total = uint32(length) tableLog = s.actualTableLog - lowThreshold = uint32(total >> tableLog) - lowOne = uint32((total * 3) >> (tableLog + 1)) + lowThreshold = total >> tableLog + lowOne = (total * 3) >> (tableLog + 1) ) for i, cnt := range s.count[:s.symbolLen] { if cnt == 0 { @@ -379,7 +363,7 @@ func (s *fseEncoder) normalizeCount2(length int) error { if (total / toDistribute) > lowOne { // risk of rounding to zero - lowOne = uint32((total * 3) / (toDistribute * 2)) + lowOne = (total * 3) / (toDistribute * 2) for i, cnt := range s.count[:s.symbolLen] { if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) { s.norm[i] = 1 @@ -708,15 +692,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) { im := int32((nbBitsOut << 16) - first.deltaNbBits) lu := (im >> nbBitsOut) + int32(first.deltaFindState) c.state = c.stateTable[lu] - return -} - -// encode the output symbol provided and write it to the bitstream. -func (c *cState) encode(symbolTT symbolTransform) { - nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16 - dstState := int32(c.state>>(nbBitsOut&15)) + int32(symbolTT.deltaFindState) - c.bw.addBits16NC(c.state, uint8(nbBitsOut)) - c.state = c.stateTable[dstState] } // flush will write the tablelog to the output and flush the remaining full bytes. diff --git a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go index 6c17dc17f4..474cb77d2b 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go @@ -59,7 +59,7 @@ func fillBase(dst []baseOffset, base uint32, bits ...uint8) { } for i, bit := range bits { if base > math.MaxInt32 { - panic(fmt.Sprintf("invalid decoding table, base overflows int32")) + panic("invalid decoding table, base overflows int32") } dst[i] = baseOffset{ diff --git a/vendor/github.com/klauspost/compress/zstd/hash.go b/vendor/github.com/klauspost/compress/zstd/hash.go index 4a752067fc..5d73c21ebd 100644 --- a/vendor/github.com/klauspost/compress/zstd/hash.go +++ b/vendor/github.com/klauspost/compress/zstd/hash.go @@ -13,65 +13,23 @@ const ( prime8bytes = 0xcf1bbcdcb7a56463 ) -// hashLen returns a hash of the lowest l bytes of u for a size size of h bytes. -// l must be >=4 and <=8. Any other value will return hash for 4 bytes. -// h should always be <32. -// Preferably h and l should be a constant. 
-// FIXME: This does NOT get resolved, if 'mls' is constant,
-// so this cannot be used.
-func hashLen(u uint64, hashLog, mls uint8) uint32 {
+// hashLen returns a hash of the lowest mls bytes of u, with length output bits.
+// mls must be >=3 and <=8. Any other value will return hash for 4 bytes.
+// length should always be < 32.
+// Preferably length and mls should be a constant for inlining.
+func hashLen(u uint64, length, mls uint8) uint32 {
 	switch mls {
+	case 3:
+		return (uint32(u<<8) * prime3bytes) >> (32 - length)
 	case 5:
-		return hash5(u, hashLog)
+		return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length))
 	case 6:
-		return hash6(u, hashLog)
+		return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length))
 	case 7:
-		return hash7(u, hashLog)
+		return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length))
 	case 8:
-		return hash8(u, hashLog)
+		return uint32((u * prime8bytes) >> (64 - length))
 	default:
-		return hash4x64(u, hashLog)
+		return (uint32(u) * prime4bytes) >> (32 - length)
 	}
 }
-
-// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash3(u uint32, h uint8) uint32 {
-	return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31)
-}
-
-// hash4 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4(u uint32, h uint8) uint32 {
-	return (u * prime4bytes) >> ((32 - h) & 31)
-}
-
-// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4x64(u uint64, h uint8) uint32 {
-	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
-}
-
-// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash5(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
-}
-
-// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash6(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
-}
-
-// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash7(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
-}
-
-// hash8 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash8(u uint64, h uint8) uint32 {
-	return uint32((u * prime8bytes) >> ((64 - h) & 63))
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index f783e32d25..09164856d2 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -10,40 +10,48 @@ import (
 
 // history contains the information transferred between blocks.
 type history struct {
-	b             []byte
-	huffTree      *huff0.Scratch
-	recentOffsets [3]int
+	// Literal decompression
+	huffTree *huff0.Scratch
+
+	// Sequence decompression
 	decoders      sequenceDecs
-	windowSize    int
-	maxSize       int
-	error         bool
-	dict          *dict
+	recentOffsets [3]int
+
+	// History buffer...
+ b []byte + + // ignoreBuffer is meant to ignore a number of bytes + // when checking for matches in history + ignoreBuffer int + + windowSize int + allocFrameBuffer int // needed? + error bool + dict *dict } // reset will reset the history to initial state of a frame. // The history must already have been initialized to the desired size. func (h *history) reset() { h.b = h.b[:0] + h.ignoreBuffer = 0 h.error = false h.recentOffsets = [3]int{1, 4, 8} - if f := h.decoders.litLengths.fse; f != nil && !f.preDefined { - fseDecoderPool.Put(f) - } - if f := h.decoders.offsets.fse; f != nil && !f.preDefined { - fseDecoderPool.Put(f) - } - if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined { - fseDecoderPool.Put(f) - } - h.decoders = sequenceDecs{} + h.decoders.freeDecoders() + h.decoders = sequenceDecs{br: h.decoders.br} + h.freeHuffDecoder() + h.huffTree = nil + h.dict = nil + //printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b)) +} + +func (h *history) freeHuffDecoder() { if h.huffTree != nil { if h.dict == nil || h.dict.litEnc != h.huffTree { huffDecoderPool.Put(h.huffTree) + h.huffTree = nil } } - h.huffTree = nil - h.dict = nil - //printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b)) } func (h *history) setDict(dict *dict) { @@ -54,6 +62,7 @@ func (h *history) setDict(dict *dict) { h.decoders.litLengths = dict.llDec h.decoders.offsets = dict.ofDec h.decoders.matchLengths = dict.mlDec + h.decoders.dict = dict.content h.recentOffsets = dict.offsets h.huffTree = dict.litEnc } @@ -83,6 +92,24 @@ func (h *history) append(b []byte) { copy(h.b[h.windowSize-len(b):], b) } +// ensureBlock will ensure there is space for at least one block... +func (h *history) ensureBlock() { + if cap(h.b) < h.allocFrameBuffer { + h.b = make([]byte, 0, h.allocFrameBuffer) + return + } + + avail := cap(h.b) - len(h.b) + if avail >= h.windowSize || avail > maxCompressedBlockSize { + return + } + // Move data down so we only have window size left. + // We know we have less than window size in b at this point. + discard := len(h.b) - h.windowSize + copy(h.b, h.b[discard:]) + h.b = h.b[:h.windowSize] +} + // append bytes to history without ever discarding anything. func (h *history) appendKeep(b []byte) { h.b = append(h.b, b...) diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md index 69aa3bb587..777290d44c 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md @@ -2,12 +2,7 @@ VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package. - -[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash) -[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash) - -xxhash is a Go implementation of the 64-bit -[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a +xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a high-quality hashing algorithm that is much faster than anything in the Go standard library. @@ -28,31 +23,49 @@ func (*Digest) WriteString(string) (int, error) func (*Digest) Sum64() uint64 ``` -This implementation provides a fast pure-Go implementation and an even faster -assembly implementation for amd64. 
+The package is written with optimized pure Go and also contains even faster +assembly implementations for amd64 and arm64. If desired, the `purego` build tag +opts into using the Go code even on those architectures. + +[xxHash]: http://cyan4973.github.io/xxHash/ + +## Compatibility + +This package is in a module and the latest code is in version 2 of the module. +You need a version of Go with at least "minimal module compatibility" to use +github.com/cespare/xxhash/v2: + +* 1.9.7+ for Go 1.9 +* 1.10.3+ for Go 1.10 +* Go 1.11 or later + +I recommend using the latest release of Go. ## Benchmarks Here are some quick benchmarks comparing the pure-Go and assembly implementations of Sum64. -| input size | purego | asm | -| --- | --- | --- | -| 5 B | 979.66 MB/s | 1291.17 MB/s | -| 100 B | 7475.26 MB/s | 7973.40 MB/s | -| 4 KB | 17573.46 MB/s | 17602.65 MB/s | -| 10 MB | 17131.46 MB/s | 17142.16 MB/s | +| input size | purego | asm | +| ---------- | --------- | --------- | +| 4 B | 1.3 GB/s | 1.2 GB/s | +| 16 B | 2.9 GB/s | 3.5 GB/s | +| 100 B | 6.9 GB/s | 8.1 GB/s | +| 4 KB | 11.7 GB/s | 16.7 GB/s | +| 10 MB | 12.0 GB/s | 17.3 GB/s | -These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using -the following commands under Go 1.11.2: +These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C +CPU using the following commands under Go 1.19.2: ``` -$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes' -$ go test -benchtime 10s -bench '/xxhash,direct,bytes' +benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$') +benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$') ``` ## Projects using this package - [InfluxDB](https://github.com/influxdata/influxdb) - [Prometheus](https://github.com/prometheus/prometheus) +- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics) - [FreeCache](https://github.com/coocood/freecache) +- [FastCache](https://github.com/VictoriaMetrics/fastcache) diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go index 426b9cac78..fc40c82001 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go @@ -18,19 +18,11 @@ const ( prime5 uint64 = 2870177450012600261 ) -// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where -// possible in the Go code is worth a small (but measurable) performance boost -// by avoiding some MOVQs. Vars are needed for the asm and also are useful for -// convenience in the Go code in a few places where we need to intentionally -// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the -// result overflows a uint64). -var ( - prime1v = prime1 - prime2v = prime2 - prime3v = prime3 - prime4v = prime4 - prime5v = prime5 -) +// Store the primes in an array as well. +// +// The consts are used when possible in Go code to avoid MOVs but we need a +// contiguous array of the assembly code. +var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5} // Digest implements hash.Hash64. type Digest struct { @@ -52,10 +44,10 @@ func New() *Digest { // Reset clears the Digest's state so that it can be reused. 
func (d *Digest) Reset() { - d.v1 = prime1v + prime2 + d.v1 = primes[0] + prime2 d.v2 = prime2 d.v3 = 0 - d.v4 = -prime1v + d.v4 = -primes[0] d.total = 0 d.n = 0 } @@ -71,21 +63,23 @@ func (d *Digest) Write(b []byte) (n int, err error) { n = len(b) d.total += uint64(n) + memleft := d.mem[d.n&(len(d.mem)-1):] + if d.n+n < 32 { // This new data doesn't even fill the current block. - copy(d.mem[d.n:], b) + copy(memleft, b) d.n += n return } if d.n > 0 { // Finish off the partial block. - copy(d.mem[d.n:], b) + c := copy(memleft, b) d.v1 = round(d.v1, u64(d.mem[0:8])) d.v2 = round(d.v2, u64(d.mem[8:16])) d.v3 = round(d.v3, u64(d.mem[16:24])) d.v4 = round(d.v4, u64(d.mem[24:32])) - b = b[32-d.n:] + b = b[c:] d.n = 0 } @@ -135,21 +129,20 @@ func (d *Digest) Sum64() uint64 { h += d.total - i, end := 0, d.n - for ; i+8 <= end; i += 8 { - k1 := round(0, u64(d.mem[i:i+8])) + b := d.mem[:d.n&(len(d.mem)-1)] + for ; len(b) >= 8; b = b[8:] { + k1 := round(0, u64(b[:8])) h ^= k1 h = rol27(h)*prime1 + prime4 } - if i+4 <= end { - h ^= uint64(u32(d.mem[i:i+4])) * prime1 + if len(b) >= 4 { + h ^= uint64(u32(b[:4])) * prime1 h = rol23(h)*prime2 + prime3 - i += 4 + b = b[4:] } - for i < end { - h ^= uint64(d.mem[i]) * prime5 + for ; len(b) > 0; b = b[1:] { + h ^= uint64(b[0]) * prime5 h = rol11(h) * prime1 - i++ } h ^= h >> 33 @@ -195,7 +188,6 @@ func (d *Digest) UnmarshalBinary(b []byte) error { b, d.v4 = consumeUint64(b) b, d.total = consumeUint64(b) copy(d.mem[:], b) - b = b[len(d.mem):] d.n = int(d.total % uint64(len(d.mem))) return nil } diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s index 2c9c5357a1..ddb63aa91b 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s @@ -1,215 +1,210 @@ +//go:build !appengine && gc && !purego && !noasm // +build !appengine // +build gc // +build !purego +// +build !noasm #include "textflag.h" -// Register allocation: -// AX h -// CX pointer to advance through b -// DX n -// BX loop end -// R8 v1, k1 -// R9 v2 -// R10 v3 -// R11 v4 -// R12 tmp -// R13 prime1v -// R14 prime2v -// R15 prime4v - -// round reads from and advances the buffer pointer in CX. -// It assumes that R13 has prime1v and R14 has prime2v. -#define round(r) \ - MOVQ (CX), R12 \ - ADDQ $8, CX \ - IMULQ R14, R12 \ - ADDQ R12, r \ - ROLQ $31, r \ - IMULQ R13, r - -// mergeRound applies a merge round on the two registers acc and val. -// It assumes that R13 has prime1v, R14 has prime2v, and R15 has prime4v. -#define mergeRound(acc, val) \ - IMULQ R14, val \ - ROLQ $31, val \ - IMULQ R13, val \ - XORQ val, acc \ - IMULQ R13, acc \ - ADDQ R15, acc +// Registers: +#define h AX +#define d AX +#define p SI // pointer to advance through b +#define n DX +#define end BX // loop end +#define v1 R8 +#define v2 R9 +#define v3 R10 +#define v4 R11 +#define x R12 +#define prime1 R13 +#define prime2 R14 +#define prime4 DI + +#define round(acc, x) \ + IMULQ prime2, x \ + ADDQ x, acc \ + ROLQ $31, acc \ + IMULQ prime1, acc + +// round0 performs the operation x = round(0, x). +#define round0(x) \ + IMULQ prime2, x \ + ROLQ $31, x \ + IMULQ prime1, x + +// mergeRound applies a merge round on the two registers acc and x. +// It assumes that prime1, prime2, and prime4 have been loaded. 
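+// In Go terms it computes acc = ((acc ^ round0(x)) * prime1) + prime4,
+// matching the pure Go mergeRound helper.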
+#define mergeRound(acc, x) \ + round0(x) \ + XORQ x, acc \ + IMULQ prime1, acc \ + ADDQ prime4, acc + +// blockLoop processes as many 32-byte blocks as possible, +// updating v1, v2, v3, and v4. It assumes that there is at least one block +// to process. +#define blockLoop() \ +loop: \ + MOVQ +0(p), x \ + round(v1, x) \ + MOVQ +8(p), x \ + round(v2, x) \ + MOVQ +16(p), x \ + round(v3, x) \ + MOVQ +24(p), x \ + round(v4, x) \ + ADDQ $32, p \ + CMPQ p, end \ + JLE loop // func Sum64(b []byte) uint64 -TEXT ·Sum64(SB), NOSPLIT, $0-32 +TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 // Load fixed primes. - MOVQ ·prime1v(SB), R13 - MOVQ ·prime2v(SB), R14 - MOVQ ·prime4v(SB), R15 + MOVQ ·primes+0(SB), prime1 + MOVQ ·primes+8(SB), prime2 + MOVQ ·primes+24(SB), prime4 // Load slice. - MOVQ b_base+0(FP), CX - MOVQ b_len+8(FP), DX - LEAQ (CX)(DX*1), BX + MOVQ b_base+0(FP), p + MOVQ b_len+8(FP), n + LEAQ (p)(n*1), end // The first loop limit will be len(b)-32. - SUBQ $32, BX + SUBQ $32, end // Check whether we have at least one block. - CMPQ DX, $32 + CMPQ n, $32 JLT noBlocks // Set up initial state (v1, v2, v3, v4). - MOVQ R13, R8 - ADDQ R14, R8 - MOVQ R14, R9 - XORQ R10, R10 - XORQ R11, R11 - SUBQ R13, R11 - - // Loop until CX > BX. -blockLoop: - round(R8) - round(R9) - round(R10) - round(R11) - - CMPQ CX, BX - JLE blockLoop - - MOVQ R8, AX - ROLQ $1, AX - MOVQ R9, R12 - ROLQ $7, R12 - ADDQ R12, AX - MOVQ R10, R12 - ROLQ $12, R12 - ADDQ R12, AX - MOVQ R11, R12 - ROLQ $18, R12 - ADDQ R12, AX - - mergeRound(AX, R8) - mergeRound(AX, R9) - mergeRound(AX, R10) - mergeRound(AX, R11) + MOVQ prime1, v1 + ADDQ prime2, v1 + MOVQ prime2, v2 + XORQ v3, v3 + XORQ v4, v4 + SUBQ prime1, v4 + + blockLoop() + + MOVQ v1, h + ROLQ $1, h + MOVQ v2, x + ROLQ $7, x + ADDQ x, h + MOVQ v3, x + ROLQ $12, x + ADDQ x, h + MOVQ v4, x + ROLQ $18, x + ADDQ x, h + + mergeRound(h, v1) + mergeRound(h, v2) + mergeRound(h, v3) + mergeRound(h, v4) JMP afterBlocks noBlocks: - MOVQ ·prime5v(SB), AX + MOVQ ·primes+32(SB), h afterBlocks: - ADDQ DX, AX - - // Right now BX has len(b)-32, and we want to loop until CX > len(b)-8. - ADDQ $24, BX - - CMPQ CX, BX - JG fourByte - -wordLoop: - // Calculate k1. 
- MOVQ (CX), R8 - ADDQ $8, CX - IMULQ R14, R8 - ROLQ $31, R8 - IMULQ R13, R8 - - XORQ R8, AX - ROLQ $27, AX - IMULQ R13, AX - ADDQ R15, AX - - CMPQ CX, BX - JLE wordLoop - -fourByte: - ADDQ $4, BX - CMPQ CX, BX - JG singles - - MOVL (CX), R8 - ADDQ $4, CX - IMULQ R13, R8 - XORQ R8, AX - - ROLQ $23, AX - IMULQ R14, AX - ADDQ ·prime3v(SB), AX - -singles: - ADDQ $4, BX - CMPQ CX, BX + ADDQ n, h + + ADDQ $24, end + CMPQ p, end + JG try4 + +loop8: + MOVQ (p), x + ADDQ $8, p + round0(x) + XORQ x, h + ROLQ $27, h + IMULQ prime1, h + ADDQ prime4, h + + CMPQ p, end + JLE loop8 + +try4: + ADDQ $4, end + CMPQ p, end + JG try1 + + MOVL (p), x + ADDQ $4, p + IMULQ prime1, x + XORQ x, h + + ROLQ $23, h + IMULQ prime2, h + ADDQ ·primes+16(SB), h + +try1: + ADDQ $4, end + CMPQ p, end JGE finalize -singlesLoop: - MOVBQZX (CX), R12 - ADDQ $1, CX - IMULQ ·prime5v(SB), R12 - XORQ R12, AX +loop1: + MOVBQZX (p), x + ADDQ $1, p + IMULQ ·primes+32(SB), x + XORQ x, h + ROLQ $11, h + IMULQ prime1, h - ROLQ $11, AX - IMULQ R13, AX - - CMPQ CX, BX - JL singlesLoop + CMPQ p, end + JL loop1 finalize: - MOVQ AX, R12 - SHRQ $33, R12 - XORQ R12, AX - IMULQ R14, AX - MOVQ AX, R12 - SHRQ $29, R12 - XORQ R12, AX - IMULQ ·prime3v(SB), AX - MOVQ AX, R12 - SHRQ $32, R12 - XORQ R12, AX - - MOVQ AX, ret+24(FP) + MOVQ h, x + SHRQ $33, x + XORQ x, h + IMULQ prime2, h + MOVQ h, x + SHRQ $29, x + XORQ x, h + IMULQ ·primes+16(SB), h + MOVQ h, x + SHRQ $32, x + XORQ x, h + + MOVQ h, ret+24(FP) RET -// writeBlocks uses the same registers as above except that it uses AX to store -// the d pointer. - // func writeBlocks(d *Digest, b []byte) int -TEXT ·writeBlocks(SB), NOSPLIT, $0-40 +TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 // Load fixed primes needed for round. - MOVQ ·prime1v(SB), R13 - MOVQ ·prime2v(SB), R14 + MOVQ ·primes+0(SB), prime1 + MOVQ ·primes+8(SB), prime2 // Load slice. - MOVQ arg1_base+8(FP), CX - MOVQ arg1_len+16(FP), DX - LEAQ (CX)(DX*1), BX - SUBQ $32, BX + MOVQ b_base+8(FP), p + MOVQ b_len+16(FP), n + LEAQ (p)(n*1), end + SUBQ $32, end // Load vN from d. - MOVQ arg+0(FP), AX - MOVQ 0(AX), R8 // v1 - MOVQ 8(AX), R9 // v2 - MOVQ 16(AX), R10 // v3 - MOVQ 24(AX), R11 // v4 + MOVQ s+0(FP), d + MOVQ 0(d), v1 + MOVQ 8(d), v2 + MOVQ 16(d), v3 + MOVQ 24(d), v4 // We don't need to check the loop condition here; this function is // always called with at least one block of data to process. -blockLoop: - round(R8) - round(R9) - round(R10) - round(R11) - - CMPQ CX, BX - JLE blockLoop + blockLoop() // Copy vN back to d. - MOVQ R8, 0(AX) - MOVQ R9, 8(AX) - MOVQ R10, 16(AX) - MOVQ R11, 24(AX) - - // The number of bytes written is CX minus the old base pointer. - SUBQ arg1_base+8(FP), CX - MOVQ CX, ret+32(FP) + MOVQ v1, 0(d) + MOVQ v2, 8(d) + MOVQ v3, 16(d) + MOVQ v4, 24(d) + + // The number of bytes written is p minus the old base pointer. 
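+	// (Only whole 32-byte blocks are consumed, so the result is always a
+	// multiple of 32.)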
+ SUBQ b_base+8(FP), p + MOVQ p, ret+32(FP) RET diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s new file mode 100644 index 0000000000..ae7d4d3295 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s @@ -0,0 +1,184 @@ +//go:build !appengine && gc && !purego && !noasm +// +build !appengine +// +build gc +// +build !purego +// +build !noasm + +#include "textflag.h" + +// Registers: +#define digest R1 +#define h R2 // return value +#define p R3 // input pointer +#define n R4 // input length +#define nblocks R5 // n / 32 +#define prime1 R7 +#define prime2 R8 +#define prime3 R9 +#define prime4 R10 +#define prime5 R11 +#define v1 R12 +#define v2 R13 +#define v3 R14 +#define v4 R15 +#define x1 R20 +#define x2 R21 +#define x3 R22 +#define x4 R23 + +#define round(acc, x) \ + MADD prime2, acc, x, acc \ + ROR $64-31, acc \ + MUL prime1, acc + +// round0 performs the operation x = round(0, x). +#define round0(x) \ + MUL prime2, x \ + ROR $64-31, x \ + MUL prime1, x + +#define mergeRound(acc, x) \ + round0(x) \ + EOR x, acc \ + MADD acc, prime4, prime1, acc + +// blockLoop processes as many 32-byte blocks as possible, +// updating v1, v2, v3, and v4. It assumes that n >= 32. +#define blockLoop() \ + LSR $5, n, nblocks \ + PCALIGN $16 \ + loop: \ + LDP.P 16(p), (x1, x2) \ + LDP.P 16(p), (x3, x4) \ + round(v1, x1) \ + round(v2, x2) \ + round(v3, x3) \ + round(v4, x4) \ + SUB $1, nblocks \ + CBNZ nblocks, loop + +// func Sum64(b []byte) uint64 +TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 + LDP b_base+0(FP), (p, n) + + LDP ·primes+0(SB), (prime1, prime2) + LDP ·primes+16(SB), (prime3, prime4) + MOVD ·primes+32(SB), prime5 + + CMP $32, n + CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 } + BLT afterLoop + + ADD prime1, prime2, v1 + MOVD prime2, v2 + MOVD $0, v3 + NEG prime1, v4 + + blockLoop() + + ROR $64-1, v1, x1 + ROR $64-7, v2, x2 + ADD x1, x2 + ROR $64-12, v3, x3 + ROR $64-18, v4, x4 + ADD x3, x4 + ADD x2, x4, h + + mergeRound(h, v1) + mergeRound(h, v2) + mergeRound(h, v3) + mergeRound(h, v4) + +afterLoop: + ADD n, h + + TBZ $4, n, try8 + LDP.P 16(p), (x1, x2) + + round0(x1) + + // NOTE: here and below, sequencing the EOR after the ROR (using a + // rotated register) is worth a small but measurable speedup for small + // inputs. + ROR $64-27, h + EOR x1 @> 64-27, h, h + MADD h, prime4, prime1, h + + round0(x2) + ROR $64-27, h + EOR x2 @> 64-27, h, h + MADD h, prime4, prime1, h + +try8: + TBZ $3, n, try4 + MOVD.P 8(p), x1 + + round0(x1) + ROR $64-27, h + EOR x1 @> 64-27, h, h + MADD h, prime4, prime1, h + +try4: + TBZ $2, n, try2 + MOVWU.P 4(p), x2 + + MUL prime1, x2 + ROR $64-23, h + EOR x2 @> 64-23, h, h + MADD h, prime3, prime2, h + +try2: + TBZ $1, n, try1 + MOVHU.P 2(p), x3 + AND $255, x3, x1 + LSR $8, x3, x2 + + MUL prime5, x1 + ROR $64-11, h + EOR x1 @> 64-11, h, h + MUL prime1, h + + MUL prime5, x2 + ROR $64-11, h + EOR x2 @> 64-11, h, h + MUL prime1, h + +try1: + TBZ $0, n, finalize + MOVBU (p), x4 + + MUL prime5, x4 + ROR $64-11, h + EOR x4 @> 64-11, h, h + MUL prime1, h + +finalize: + EOR h >> 33, h + MUL prime2, h + EOR h >> 29, h + MUL prime3, h + EOR h >> 32, h + + MOVD h, ret+24(FP) + RET + +// func writeBlocks(s *Digest, b []byte) int +TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 + LDP ·primes+0(SB), (prime1, prime2) + + // Load state. Assume v[1-4] are stored contiguously. 
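+	// (v1..v4 are the four leading uint64 fields of Digest in xxhash.go.)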
+ MOVD s+0(FP), digest + LDP 0(digest), (v1, v2) + LDP 16(digest), (v3, v4) + + LDP b_base+8(FP), (p, n) + + blockLoop() + + // Store updated state. + STP (v1, v2), 0(digest) + STP (v3, v4), 16(digest) + + BIC $31, n + MOVD n, ret+32(FP) + RET diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go similarity index 54% rename from vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go rename to vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go index 35318d7c46..d4221edf4f 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go @@ -1,6 +1,9 @@ +//go:build (amd64 || arm64) && !appengine && gc && !purego && !noasm +// +build amd64 arm64 // +build !appengine // +build gc // +build !purego +// +build !noasm package xxhash @@ -10,4 +13,4 @@ package xxhash func Sum64(b []byte) uint64 //go:noescape -func writeBlocks(*Digest, []byte) int +func writeBlocks(s *Digest, b []byte) int diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go index 4a5a821603..0be16cefc7 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go @@ -1,4 +1,5 @@ -// +build !amd64 appengine !gc purego +//go:build (!amd64 && !arm64) || appengine || !gc || purego || noasm +// +build !amd64,!arm64 appengine !gc purego noasm package xxhash @@ -14,10 +15,10 @@ func Sum64(b []byte) uint64 { var h uint64 if n >= 32 { - v1 := prime1v + prime2 + v1 := primes[0] + prime2 v2 := prime2 v3 := uint64(0) - v4 := -prime1v + v4 := -primes[0] for len(b) >= 32 { v1 = round(v1, u64(b[0:8:len(b)])) v2 = round(v2, u64(b[8:16:len(b)])) @@ -36,19 +37,18 @@ func Sum64(b []byte) uint64 { h += uint64(n) - i, end := 0, len(b) - for ; i+8 <= end; i += 8 { - k1 := round(0, u64(b[i:i+8:len(b)])) + for ; len(b) >= 8; b = b[8:] { + k1 := round(0, u64(b[:8])) h ^= k1 h = rol27(h)*prime1 + prime4 } - if i+4 <= end { - h ^= uint64(u32(b[i:i+4:len(b)])) * prime1 + if len(b) >= 4 { + h ^= uint64(u32(b[:4])) * prime1 h = rol23(h)*prime2 + prime3 - i += 4 + b = b[4:] } - for ; i < end; i++ { - h ^= uint64(b[i]) * prime5 + for ; len(b) > 0; b = b[1:] { + h ^= uint64(b[0]) * prime5 h = rol11(h) * prime1 } diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.go b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.go new file mode 100644 index 0000000000..f41932b7a4 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.go @@ -0,0 +1,16 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +// Copyright 2019+ Klaus Post. All rights reserved. +// License information can be found in the LICENSE file. 
+ +package zstd + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// +// len(a) <= len(b) and len(a) > 0 +// +//go:noescape +func matchLen(a []byte, b []byte) int diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.s b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.s new file mode 100644 index 0000000000..0782b86e3d --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.s @@ -0,0 +1,66 @@ +// Copied from S2 implementation. + +//go:build !appengine && !noasm && gc && !noasm + +#include "textflag.h" + +// func matchLen(a []byte, b []byte) int +TEXT ·matchLen(SB), NOSPLIT, $0-56 + MOVQ a_base+0(FP), AX + MOVQ b_base+24(FP), CX + MOVQ a_len+8(FP), DX + + // matchLen + XORL SI, SI + CMPL DX, $0x08 + JB matchlen_match4_standalone + +matchlen_loopback_standalone: + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX + JZ matchlen_loop_standalone + +#ifdef GOAMD64_v3 + TZCNTQ BX, BX +#else + BSFQ BX, BX +#endif + SHRL $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end + +matchlen_loop_standalone: + LEAL -8(DX), DX + LEAL 8(SI), SI + CMPL DX, $0x08 + JAE matchlen_loopback_standalone + +matchlen_match4_standalone: + CMPL DX, $0x04 + JB matchlen_match2_standalone + MOVL (AX)(SI*1), BX + CMPL (CX)(SI*1), BX + JNE matchlen_match2_standalone + LEAL -4(DX), DX + LEAL 4(SI), SI + +matchlen_match2_standalone: + CMPL DX, $0x02 + JB matchlen_match1_standalone + MOVW (AX)(SI*1), BX + CMPW (CX)(SI*1), BX + JNE matchlen_match1_standalone + LEAL -2(DX), DX + LEAL 2(SI), SI + +matchlen_match1_standalone: + CMPL DX, $0x01 + JB gen_match_len_end + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL + JNE gen_match_len_end + INCL SI + +gen_match_len_end: + MOVQ SI, ret+48(FP) + RET diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go b/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go new file mode 100644 index 0000000000..57b9c31c02 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go @@ -0,0 +1,33 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +// Copyright 2019+ Klaus Post. All rights reserved. +// License information can be found in the LICENSE file. + +package zstd + +import ( + "encoding/binary" + "math/bits" +) + +// matchLen returns the maximum common prefix length of a and b. +// a must be the shortest of the two. 
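+//
+// Worked example with hypothetical values: for a="abcdefgh", b="abcXefgh"
+// the 8-byte fast path XORs both little-endian words; bytes 0-2 match, so
+// the low 24 bits of the XOR are zero, bits.TrailingZeros64 returns a value
+// in [24,31], and >>3 yields 3 matching bytes.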
+func matchLen(a, b []byte) (n int) { + for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] { + diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b) + if diff != 0 { + return n + bits.TrailingZeros64(diff)>>3 + } + n += 8 + } + + for i := range a { + if a[i] != b[i] { + break + } + n++ + } + return n + +} diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go index b5c8ef1332..d7fe6d82d9 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go @@ -20,6 +20,10 @@ type seq struct { llCode, mlCode, ofCode uint8 } +type seqVals struct { + ll, ml, mo int +} + func (s seq) String() string { if s.offset <= 3 { if s.offset == 0 { @@ -61,16 +65,19 @@ type sequenceDecs struct { offsets sequenceDec matchLengths sequenceDec prevOffset [3]int - hist []byte dict []byte literals []byte out []byte + nSeqs int + br *bitReader + seqSize int windowSize int maxBits uint8 + maxSyncLen uint64 } // initialize all 3 decoders from the stream input. -func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []byte) error { +func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) error { if err := s.litLengths.init(br); err != nil { return errors.New("litLengths:" + err.Error()) } @@ -80,8 +87,7 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out [] if err := s.matchLengths.init(br); err != nil { return errors.New("matchLengths:" + err.Error()) } - s.literals = literals - s.hist = hist.b + s.br = br s.prevOffset = hist.recentOffsets s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits s.windowSize = hist.windowSize @@ -93,20 +99,153 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out [] return nil } +func (s *sequenceDecs) freeDecoders() { + if f := s.litLengths.fse; f != nil && !f.preDefined { + fseDecoderPool.Put(f) + s.litLengths.fse = nil + } + if f := s.offsets.fse; f != nil && !f.preDefined { + fseDecoderPool.Put(f) + s.offsets.fse = nil + } + if f := s.matchLengths.fse; f != nil && !f.preDefined { + fseDecoderPool.Put(f) + s.matchLengths.fse = nil + } +} + +// execute will execute the decoded sequence with the provided history. +// The sequence must be evaluated before being sent. +func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error { + if len(s.dict) == 0 { + return s.executeSimple(seqs, hist) + } + + // Ensure we have enough output size... + if len(s.out)+s.seqSize > cap(s.out) { + addBytes := s.seqSize + len(s.out) + s.out = append(s.out, make([]byte, addBytes)...) + s.out = s.out[:len(s.out)-addBytes] + } + + if debugDecoder { + printf("Execute %d seqs with hist %d, dict %d, literals: %d into %d bytes\n", len(seqs), len(hist), len(s.dict), len(s.literals), s.seqSize) + } + + var t = len(s.out) + out := s.out[:t+s.seqSize] + + for _, seq := range seqs { + // Add literals + copy(out[t:], s.literals[:seq.ll]) + t += seq.ll + s.literals = s.literals[seq.ll:] + + // Copy from dictionary... + if seq.mo > t+len(hist) || seq.mo > s.windowSize { + if len(s.dict) == 0 { + return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist)) + } + + // we may be in dictionary. 
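+			// An offset past output+history addresses dictionary
+			// content: the overshoot seq.mo-(t+len(hist)) is counted
+			// back from the end of s.dict.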
+ dictO := len(s.dict) - (seq.mo - (t + len(hist))) + if dictO < 0 || dictO >= len(s.dict) { + return fmt.Errorf("match offset (%d) bigger than current history+dict (%d)", seq.mo, t+len(hist)+len(s.dict)) + } + end := dictO + seq.ml + if end > len(s.dict) { + n := len(s.dict) - dictO + copy(out[t:], s.dict[dictO:]) + t += n + seq.ml -= n + } else { + copy(out[t:], s.dict[dictO:end]) + t += end - dictO + continue + } + } + + // Copy from history. + if v := seq.mo - t; v > 0 { + // v is the start position in history from end. + start := len(hist) - v + if seq.ml > v { + // Some goes into current block. + // Copy remainder of history + copy(out[t:], hist[start:]) + t += v + seq.ml -= v + } else { + copy(out[t:], hist[start:start+seq.ml]) + t += seq.ml + continue + } + } + // We must be in current buffer now + if seq.ml > 0 { + start := t - seq.mo + if seq.ml <= t-start { + // No overlap + copy(out[t:], out[start:start+seq.ml]) + t += seq.ml + continue + } else { + // Overlapping copy + // Extend destination slice and copy one byte at the time. + src := out[start : start+seq.ml] + dst := out[t:] + dst = dst[:len(src)] + t += len(src) + // Destination is the space we just added. + for i := range src { + dst[i] = src[i] + } + } + } + } + + // Add final literals + copy(out[t:], s.literals) + if debugDecoder { + t += len(s.literals) + if t != len(out) { + panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize)) + } + } + s.out = out + + return nil +} + // decode sequences from the stream with the provided history. -func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { +func (s *sequenceDecs) decodeSync(hist []byte) error { + supported, err := s.decodeSyncSimple(hist) + if supported { + return err + } + + br := s.br + seqs := s.nSeqs startSize := len(s.out) // Grab full sizes tables, to avoid bounds checks. llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize] llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state + out := s.out + maxBlockSize := maxCompressedBlockSize + if s.windowSize < maxBlockSize { + maxBlockSize = s.windowSize + } + if debugDecoder { + println("decodeSync: decoding", seqs, "sequences", br.remain(), "bits remain on stream") + } for i := seqs - 1; i >= 0; i-- { if br.overread() { - printf("reading sequence %d, exceeded available data\n", seqs-i) + printf("reading sequence %d, exceeded available data. 
Overread by %d\n", seqs-i, -br.remain()) return io.ErrUnexpectedEOF } var ll, mo, ml int - if br.off > 4+((maxOffsetBits+16+16)>>3) { + if len(br.in) > 4+((maxOffsetBits+16+16)>>3) { // inlined function: // ll, mo, ml = s.nextFast(br, llState, mlState, ofState) @@ -151,7 +290,7 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { if temp == 0 { // 0 is not valid; input is corrupted; force offset to 1 - println("temp was 0") + println("WARNING: temp was 0") temp = 1 } @@ -176,44 +315,49 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { if ll > len(s.literals) { return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals)) } - size := ll + ml + len(s.out) + size := ll + ml + len(out) if size-startSize > maxBlockSize { - return fmt.Errorf("output (%d) bigger than max block size", size) + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) } - if size > cap(s.out) { - // Not enough size, will be extremely rarely triggered, + if size > cap(out) { + // Not enough size, which can happen under high-volume block streaming conditions, // or if the destination slice is too small for sync operations. - // We add maxBlockSize to the capacity. - s.out = append(s.out, make([]byte, maxBlockSize)...) - s.out = s.out[:len(s.out)-maxBlockSize] + // Over-allocating here can create a large amount of GC pressure, so we try to keep + // it as contained as possible. + used := len(out) - startSize + addBytes := 256 + ll + ml + used>>2 + // Clamp to max block size. + if used+addBytes > maxBlockSize { + addBytes = maxBlockSize - used + } + out = append(out, make([]byte, addBytes)...) + out = out[:len(out)-addBytes] } if ml > maxMatchLen { return fmt.Errorf("match len (%d) bigger than max allowed length", ml) } // Add literals - s.out = append(s.out, s.literals[:ll]...) + out = append(out, s.literals[:ll]...) s.literals = s.literals[ll:] - out := s.out if mo == 0 && ml > 0 { return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml) } - if mo > len(s.out)+len(hist) || mo > s.windowSize { + if mo > len(out)+len(hist) || mo > s.windowSize { if len(s.dict) == 0 { - return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist)) + return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize) } // we may be in dictionary. - dictO := len(s.dict) - (mo - (len(s.out) + len(hist))) + dictO := len(s.dict) - (mo - (len(out) + len(hist))) if dictO < 0 || dictO >= len(s.dict) { - return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist)) + return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize) } end := dictO + ml if end > len(s.dict) { out = append(out, s.dict[dictO:]...) - mo -= len(s.dict) - dictO ml -= len(s.dict) - dictO } else { out = append(out, s.dict[dictO:end]...) @@ -224,26 +368,25 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { // Copy from history. // TODO: Blocks without history could be made to ignore this completely. - if v := mo - len(s.out); v > 0 { + if v := mo - len(out); v > 0 { // v is the start position in history from end. - start := len(s.hist) - v + start := len(hist) - v if ml > v { // Some goes into current block. // Copy remainder of history - out = append(out, s.hist[start:]...) - mo -= v + out = append(out, hist[start:]...)
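+ // All of the available history has been copied; the remaining ml bytes of the match come from the current buffer below.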
ml -= v } else { - out = append(out, s.hist[start:start+ml]...) + out = append(out, hist[start:start+ml]...) ml = 0 } } // We must be in current buffer now if ml > 0 { - start := len(s.out) - mo - if ml <= len(s.out)-start { + start := len(out) - mo + if ml <= len(out)-start { // No overlap - out = append(out, s.out[start:start+ml]...) + out = append(out, out[start:start+ml]...) } else { // Overlapping copy // Extend destination slice and copy one byte at a time. @@ -257,7 +400,6 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { } } } - s.out = out if i == 0 { // This is the last sequence, so we shouldn't update state. break @@ -271,7 +413,8 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { mlState = mlTable[mlState.newState()&maxTableMask] ofState = ofTable[ofState.newState()&maxTableMask] } else { - bits := br.getBitsFast(nBits) + bits := br.get32BitsFast(nBits) + + lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31)) llState = llTable[(llState.newState()+lowBits)&maxTableMask] @@ -284,19 +427,13 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { } } - // Add final literals - s.out = append(s.out, s.literals...) - return nil -} + if size := len(s.literals) + len(out) - startSize; size > maxBlockSize { + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } -// update states, at least 27 bits must be available. -func (s *sequenceDecs) update(br *bitReader) { - // Max 8 bits - s.litLengths.state.next(br) - // Max 9 bits - s.matchLengths.state.next(br) - // Max 8 bits - s.offsets.state.next(br) + // Add final literals + s.out = append(out, s.literals...) + return br.close() } var bitMask [16]uint16 @@ -307,107 +444,21 @@ func init() { } } -// update states, at least 27 bits must be available. -func (s *sequenceDecs) updateAlt(br *bitReader) { - // Update all 3 states at once. Approx 20% faster. - a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state - - nBits := a.nbBits() + b.nbBits() + c.nbBits() - if nBits == 0 { - s.litLengths.state.state = s.litLengths.state.dt[a.newState()] - s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()] - s.offsets.state.state = s.offsets.state.dt[c.newState()] - return - } - bits := br.getBitsFast(nBits) - lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31)) - s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits] - - lowBits = uint16(bits >> (c.nbBits() & 31)) - lowBits &= bitMask[b.nbBits()&15] - s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()+lowBits] - - lowBits = uint16(bits) & bitMask[c.nbBits()&15] - s.offsets.state.state = s.offsets.state.dt[c.newState()+lowBits] -} - -// nextFast will return new states when there are at least 4 unused bytes left on the stream when done. -func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) { +func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) { // Final will not read from stream. ll, llB := llState.final() ml, mlB := mlState.final() mo, moB := ofState.final() // extra bits are stored in reverse order.
- br.fillFast() + br.fill() mo += br.getBits(moB) if s.maxBits > 32 { - br.fillFast() + br.fill() } + // matchlength+literal length, max 32 bits ml += br.getBits(mlB) ll += br.getBits(llB) - - if moB > 1 { - s.prevOffset[2] = s.prevOffset[1] - s.prevOffset[1] = s.prevOffset[0] - s.prevOffset[0] = mo - return - } - // mo = s.adjustOffset(mo, ll, moB) - // Inlined for rather big speedup - if ll == 0 { - // There is an exception though, when current sequence's literals_length = 0. - // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2, - // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte. - mo++ - } - - if mo == 0 { - mo = s.prevOffset[0] - return - } - var temp int - if mo == 3 { - temp = s.prevOffset[0] - 1 - } else { - temp = s.prevOffset[mo] - } - - if temp == 0 { - // 0 is not valid; input is corrupted; force offset to 1 - println("temp was 0") - temp = 1 - } - - if mo != 1 { - s.prevOffset[2] = s.prevOffset[1] - } - s.prevOffset[1] = s.prevOffset[0] - s.prevOffset[0] = temp - mo = temp - return -} - -func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) { - // Final will not read from stream. - ll, llB := llState.final() - ml, mlB := mlState.final() - mo, moB := ofState.final() - - // extra bits are stored in reverse order. - br.fill() - if s.maxBits <= 32 { - mo += br.getBits(moB) - ml += br.getBits(mlB) - ll += br.getBits(llB) - } else { - mo += br.getBits(moB) - br.fill() - // matchlength+literal length, max 32 bits - ml += br.getBits(mlB) - ll += br.getBits(llB) - - } mo = s.adjustOffset(mo, ll, moB) return } @@ -450,36 +501,3 @@ func (s *sequenceDecs) adjustOffset(offset, litLen int, offsetB uint8) int { s.prevOffset[0] = temp return temp } - -// mergeHistory will merge history. 
-func (s *sequenceDecs) mergeHistory(hist *sequenceDecs) (*sequenceDecs, error) { - for i := uint(0); i < 3; i++ { - var sNew, sHist *sequenceDec - switch i { - default: - // same as "case 0": - sNew = &s.litLengths - sHist = &hist.litLengths - case 1: - sNew = &s.offsets - sHist = &hist.offsets - case 2: - sNew = &s.matchLengths - sHist = &hist.matchLengths - } - if sNew.repeat { - if sHist.fse == nil { - return nil, fmt.Errorf("sequence stream %d, repeat requested, but no history", i) - } - continue - } - if sNew.fse == nil { - return nil, fmt.Errorf("sequence stream %d, no fse found", i) - } - if sHist.fse != nil && !sHist.fse.preDefined { - fseDecoderPool.Put(sHist.fse) - } - sHist.fse = sNew.fse - } - return hist, nil -} diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go new file mode 100644 index 0000000000..c59f17e07a --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go @@ -0,0 +1,394 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +package zstd + +import ( + "fmt" + "io" + + "github.com/klauspost/compress/internal/cpuinfo" +) + +type decodeSyncAsmContext struct { + llTable []decSymbol + mlTable []decSymbol + ofTable []decSymbol + llState uint64 + mlState uint64 + ofState uint64 + iteration int + litRemain int + out []byte + outPosition int + literals []byte + litPosition int + history []byte + windowSize int + ll int // set on error (not for all errors, please refer to _generate/gen.go) + ml int // set on error (not for all errors, please refer to _generate/gen.go) + mo int // set on error (not for all errors, please refer to _generate/gen.go) +} + +// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm. +// +// Please refer to seqdec_generic.go for the reference implementation. +// +//go:noescape +func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int + +// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions. +// +//go:noescape +func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int + +// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer. +// +//go:noescape +func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int + +// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer. +// +//go:noescape +func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int + +// decode sequences from the stream with the provided history but without a dictionary. +func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { + if len(s.dict) > 0 { + return false, nil + } + if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize { + return false, nil + } + + // FIXME: Using unsafe memory copies leads to rare, random crashes + // with fuzz testing. It is therefore disabled for now. 
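+ // The "safe" variants declared above never write past the destination slice, while the unsafe ones may over-copy within spare capacity for speed; the commented-out logic below selected between them based on that spare capacity.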
+ const useSafe = true + /* + useSafe := false + if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc { + useSafe = true + } + if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) { + useSafe = true + } + if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { + useSafe = true + } + */ + + br := s.br + + maxBlockSize := maxCompressedBlockSize + if s.windowSize < maxBlockSize { + maxBlockSize = s.windowSize + } + + ctx := decodeSyncAsmContext{ + llTable: s.litLengths.fse.dt[:maxTablesize], + mlTable: s.matchLengths.fse.dt[:maxTablesize], + ofTable: s.offsets.fse.dt[:maxTablesize], + llState: uint64(s.litLengths.state.state), + mlState: uint64(s.matchLengths.state.state), + ofState: uint64(s.offsets.state.state), + iteration: s.nSeqs - 1, + litRemain: len(s.literals), + out: s.out, + outPosition: len(s.out), + literals: s.literals, + windowSize: s.windowSize, + history: hist, + } + + s.seqSize = 0 + startSize := len(s.out) + + var errCode int + if cpuinfo.HasBMI2() { + if useSafe { + errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx) + } else { + errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx) + } + } else { + if useSafe { + errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx) + } else { + errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx) + } + } + switch errCode { + case noError: + break + + case errorMatchLenOfsMismatch: + return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml) + + case errorMatchLenTooBig: + return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml) + + case errorMatchOffTooBig: + return true, fmt.Errorf("match offset (%d) bigger than current history (%d)", + ctx.mo, ctx.outPosition+len(hist)-startSize) + + case errorNotEnoughLiterals: + return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", + ctx.ll, ctx.litRemain+ctx.ll) + + case errorOverread: + return true, io.ErrUnexpectedEOF + + case errorNotEnoughSpace: + size := ctx.outPosition + ctx.ll + ctx.ml + if debugDecoder { + println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize) + } + return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + + default: + return true, fmt.Errorf("sequenceDecs_decode returned erroneous code %d", errCode) + } + + s.seqSize += ctx.litRemain + if s.seqSize > maxBlockSize { + return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } + err := br.close() + if err != nil { + printf("Closing sequences: %v, %+v\n", err, *br) + return true, err + } + + s.literals = s.literals[ctx.litPosition:] + t := ctx.outPosition + s.out = s.out[:t] + + // Add final literals + s.out = append(s.out, s.literals...) 
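+ // A zstd block may end with literals that are not covered by any sequence; they are appended verbatim above.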
+ if debugDecoder { + t += len(s.literals) + if t != len(s.out) { + panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t)) + } + } + + return true, nil +} + +// -------------------------------------------------------------------------------- + +type decodeAsmContext struct { + llTable []decSymbol + mlTable []decSymbol + ofTable []decSymbol + llState uint64 + mlState uint64 + ofState uint64 + iteration int + seqs []seqVals + litRemain int +} + +const noError = 0 + +// error reported when mo == 0 && ml > 0 +const errorMatchLenOfsMismatch = 1 + +// error reported when ml > maxMatchLen +const errorMatchLenTooBig = 2 + +// error reported when mo > available history or mo > s.windowSize +const errorMatchOffTooBig = 3 + +// error reported when the sum of literal lengths exceeds the literal buffer size +const errorNotEnoughLiterals = 4 + +// error reported when capacity of `out` is too small +const errorNotEnoughSpace = 5 + +// error reported when bits are overread. +const errorOverread = 6 + +// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm. +// +// Please refer to seqdec_generic.go for the reference implementation. +// +//go:noescape +func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int + +// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm. +// +// Please refer to seqdec_generic.go for the reference implementation. +// +//go:noescape +func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int + +// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions. +// +//go:noescape +func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int + +// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions. +// +//go:noescape +func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int + +// decode sequences from the stream without the provided history.
+func (s *sequenceDecs) decode(seqs []seqVals) error { + br := s.br + + maxBlockSize := maxCompressedBlockSize + if s.windowSize < maxBlockSize { + maxBlockSize = s.windowSize + } + + ctx := decodeAsmContext{ + llTable: s.litLengths.fse.dt[:maxTablesize], + mlTable: s.matchLengths.fse.dt[:maxTablesize], + ofTable: s.offsets.fse.dt[:maxTablesize], + llState: uint64(s.litLengths.state.state), + mlState: uint64(s.matchLengths.state.state), + ofState: uint64(s.offsets.state.state), + seqs: seqs, + iteration: len(seqs) - 1, + litRemain: len(s.literals), + } + + if debugDecoder { + println("decode: decoding", len(seqs), "sequences", br.remain(), "bits remain on stream") + } + + s.seqSize = 0 + lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56 + var errCode int + if cpuinfo.HasBMI2() { + if lte56bits { + errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx) + } else { + errCode = sequenceDecs_decode_bmi2(s, br, &ctx) + } + } else { + if lte56bits { + errCode = sequenceDecs_decode_56_amd64(s, br, &ctx) + } else { + errCode = sequenceDecs_decode_amd64(s, br, &ctx) + } + } + if errCode != 0 { + i := len(seqs) - ctx.iteration - 1 + switch errCode { + case errorMatchLenOfsMismatch: + ml := ctx.seqs[i].ml + return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml) + + case errorMatchLenTooBig: + ml := ctx.seqs[i].ml + return fmt.Errorf("match len (%d) bigger than max allowed length", ml) + + case errorNotEnoughLiterals: + ll := ctx.seqs[i].ll + return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll) + case errorOverread: + return io.ErrUnexpectedEOF + } + + return fmt.Errorf("sequenceDecs_decode_amd64 returned erroneous code %d", errCode) + } + + if ctx.litRemain < 0 { + return fmt.Errorf("literal count is too big: total available %d, total requested %d", + len(s.literals), len(s.literals)-ctx.litRemain) + } + + s.seqSize += ctx.litRemain + if s.seqSize > maxBlockSize { + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } + if debugDecoder { + println("decode: ", br.remain(), "bits remain on stream. code:", errCode) + } + err := br.close() + if err != nil { + printf("Closing sequences: %v, %+v\n", err, *br) + } + return err +} + +// -------------------------------------------------------------------------------- + +type executeAsmContext struct { + seqs []seqVals + seqIndex int + out []byte + history []byte + literals []byte + outPosition int + litPosition int + windowSize int +} + +// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm. +// +// Returns false if a match offset is too big. +// +// Please refer to seqdec_generic.go for the reference implementation. +// +//go:noescape +func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool + +// Same as above, but with safe memcopies +// +//go:noescape +func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool + +// executeSimple handles cases when dictionary is not used. +func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error { + // Ensure we have enough output size... + if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) { + addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc + s.out = append(s.out, make([]byte, addBytes)...) 
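+ // The append above together with the re-slice below only grows capacity: the length is restored immediately, so the existing contents of s.out are untouched while room for the whole block is guaranteed.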
+ s.out = s.out[:len(s.out)-addBytes] + } + + if debugDecoder { + printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize) + } + + var t = len(s.out) + out := s.out[:t+s.seqSize] + + ctx := executeAsmContext{ + seqs: seqs, + seqIndex: 0, + out: out, + history: hist, + outPosition: t, + litPosition: 0, + literals: s.literals, + windowSize: s.windowSize, + } + var ok bool + if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { + ok = sequenceDecs_executeSimple_safe_amd64(&ctx) + } else { + ok = sequenceDecs_executeSimple_amd64(&ctx) + } + if !ok { + return fmt.Errorf("match offset (%d) bigger than current history (%d)", + seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist)) + } + s.literals = s.literals[ctx.litPosition:] + t = ctx.outPosition + + // Add final literals + copy(out[t:], s.literals) + if debugDecoder { + t += len(s.literals) + if t != len(out) { + panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize)) + } + } + s.out = out + + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s new file mode 100644 index 0000000000..f5591fa1e8 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -0,0 +1,4151 @@ +// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !noasm + +// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: CMOV +TEXT ·sequenceDecs_decode_amd64(SB), $8-32 + MOVQ br+8(FP), CX + MOVQ 24(CX), DX + MOVBQZX 32(CX), BX + MOVQ (CX), AX + MOVQ 8(CX), SI + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + MOVQ 104(AX), R10 + MOVQ s+0(FP), AX + MOVQ 144(AX), R11 + MOVQ 152(AX), R12 + MOVQ 160(AX), R13 + +sequenceDecs_decode_amd64_main_loop: + MOVQ (SP), R14 + + // Fill bitreader to have enough for the offset and match length. 
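+ // The fast path below refills the bit buffer eight bytes at once while at least eight input bytes remain; otherwise the byte-by-byte path shifts in one byte at a time and checks for overread.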
+ CMPQ SI, $0x08 + JL sequenceDecs_decode_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R14 + MOVQ (R14), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decode_amd64_fill_end + +sequenceDecs_decode_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decode_amd64_fill_check_overread + CMPQ BX, $0x07 + JLE sequenceDecs_decode_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R14 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R14), AX + ORQ AX, DX + JMP sequenceDecs_decode_amd64_fill_byte_by_byte + +sequenceDecs_decode_amd64_fill_check_overread: + CMPQ BX, $0x40 + JA error_overread + +sequenceDecs_decode_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_of_update_zero: + MOVQ AX, 16(R10) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ml_update_zero: + MOVQ AX, 8(R10) + + // Fill bitreader to have enough for the remaining + CMPQ SI, $0x08 + JL sequenceDecs_decode_amd64_fill_2_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R14 + MOVQ (R14), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decode_amd64_fill_2_end + +sequenceDecs_decode_amd64_fill_2_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decode_amd64_fill_2_check_overread + CMPQ BX, $0x07 + JLE sequenceDecs_decode_amd64_fill_2_end + SHLQ $0x08, DX + SUBQ $0x01, R14 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R14), AX + ORQ AX, DX + JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte + +sequenceDecs_decode_amd64_fill_2_check_overread: + CMPQ BX, $0x40 + JA error_overread + +sequenceDecs_decode_amd64_fill_2_end: + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ll_update_zero: + MOVQ AX, (R10) + + // Fill bitreader for state updates + MOVQ R14, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R14 + SHRL $0x10, DI + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, DI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R14 + SHRL $0x10, R8 + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, R8 + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R14 + SHRL $0x10, R9 + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + 
MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, R9 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decode_amd64_skip_update: + // Adjust offset + MOVQ 16(R10), CX + CMPQ AX, $0x01 + JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 + MOVQ R12, R13 + MOVQ R11, R12 + MOVQ CX, R11 + JMP sequenceDecs_decode_amd64_after_adjust + +sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: + CMPQ (R10), $0x00000000 + JNE sequenceDecs_decode_amd64_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_amd64_adjust_offset_nonzero + +sequenceDecs_decode_amd64_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero + MOVQ R11, CX + JMP sequenceDecs_decode_amd64_after_adjust + +sequenceDecs_decode_amd64_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_amd64_adjust_zero + JEQ sequenceDecs_decode_amd64_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_amd64_adjust_three + JMP sequenceDecs_decode_amd64_adjust_two + +sequenceDecs_decode_amd64_adjust_zero: + MOVQ R11, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_one: + MOVQ R12, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_two: + MOVQ R13, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_three: + LEAQ -1(R11), AX + +sequenceDecs_decode_amd64_adjust_test_temp_valid: + TESTQ AX, AX + JNZ sequenceDecs_decode_amd64_adjust_temp_valid + MOVQ $0x00000001, AX + +sequenceDecs_decode_amd64_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R12, R13 + MOVQ R11, R12 + MOVQ AX, R11 + MOVQ AX, CX + +sequenceDecs_decode_amd64_after_adjust: + MOVQ CX, 16(R10) + + // Check values + MOVQ 8(R10), AX + MOVQ (R10), R14 + LEAQ (AX)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decode_amd64_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decode_amd64_match_len_ofs_ok: + ADDQ $0x18, R10 + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decode_amd64_main_loop + MOVQ s+0(FP), AX + MOVQ R11, 144(AX) + MOVQ R12, 152(AX) + MOVQ R13, 160(AX) + MOVQ br+8(FP), AX + MOVQ DX, 24(AX) + MOVB BL, 32(AX) + MOVQ SI, 8(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_amd64_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_amd64_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with overread error +error_overread: + MOVQ $0x00000006, ret+24(FP) + RET + +// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: CMOV +TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 + MOVQ br+8(FP), CX + MOVQ 24(CX), DX + MOVBQZX 32(CX), BX + MOVQ (CX), AX + MOVQ 8(CX), SI + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + MOVQ 104(AX), R10 + MOVQ s+0(FP), AX + MOVQ 144(AX), R11 + MOVQ 
152(AX), R12 + MOVQ 160(AX), R13 + +sequenceDecs_decode_56_amd64_main_loop: + MOVQ (SP), R14 + + // Fill bitreader to have enough for the offset and match length. + CMPQ SI, $0x08 + JL sequenceDecs_decode_56_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R14 + MOVQ (R14), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decode_56_amd64_fill_end + +sequenceDecs_decode_56_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decode_56_amd64_fill_check_overread + CMPQ BX, $0x07 + JLE sequenceDecs_decode_56_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R14 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R14), AX + ORQ AX, DX + JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte + +sequenceDecs_decode_56_amd64_fill_check_overread: + CMPQ BX, $0x40 + JA error_overread + +sequenceDecs_decode_56_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_of_update_zero: + MOVQ AX, 16(R10) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ml_update_zero: + MOVQ AX, 8(R10) + + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ll_update_zero: + MOVQ AX, (R10) + + // Fill bitreader for state updates + MOVQ R14, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_56_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R14 + SHRL $0x10, DI + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, DI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R14 + SHRL $0x10, R8 + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, R8 + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R14 + SHRL $0x10, R9 + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, R9 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decode_56_amd64_skip_update: + // Adjust offset + MOVQ 16(R10), CX + CMPQ AX, $0x01 + JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 + MOVQ R12, R13 + MOVQ R11, R12 + MOVQ CX, R11 + JMP sequenceDecs_decode_56_amd64_after_adjust + 
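+ // The adjust_* blocks below implement the zstd repeat-offset rules: offset values 1-3 select one of the three previous offsets, the selection shifts by one when the literal length is zero (a value of 3 then means prevOffset[0]-1), and a resolved offset of zero is forced to one on corrupt input.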
+sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0: + CMPQ (R10), $0x00000000 + JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero + +sequenceDecs_decode_56_amd64_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero + MOVQ R11, CX + JMP sequenceDecs_decode_56_amd64_after_adjust + +sequenceDecs_decode_56_amd64_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_56_amd64_adjust_zero + JEQ sequenceDecs_decode_56_amd64_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_56_amd64_adjust_three + JMP sequenceDecs_decode_56_amd64_adjust_two + +sequenceDecs_decode_56_amd64_adjust_zero: + MOVQ R11, AX + JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid + +sequenceDecs_decode_56_amd64_adjust_one: + MOVQ R12, AX + JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid + +sequenceDecs_decode_56_amd64_adjust_two: + MOVQ R13, AX + JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid + +sequenceDecs_decode_56_amd64_adjust_three: + LEAQ -1(R11), AX + +sequenceDecs_decode_56_amd64_adjust_test_temp_valid: + TESTQ AX, AX + JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid + MOVQ $0x00000001, AX + +sequenceDecs_decode_56_amd64_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R12, R13 + MOVQ R11, R12 + MOVQ AX, R11 + MOVQ AX, CX + +sequenceDecs_decode_56_amd64_after_adjust: + MOVQ CX, 16(R10) + + // Check values + MOVQ 8(R10), AX + MOVQ (R10), R14 + LEAQ (AX)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decode_56_amd64_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decode_56_amd64_match_len_ofs_ok: + ADDQ $0x18, R10 + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decode_56_amd64_main_loop + MOVQ s+0(FP), AX + MOVQ R11, 144(AX) + MOVQ R12, 152(AX) + MOVQ R13, 160(AX) + MOVQ br+8(FP), AX + MOVQ DX, 24(AX) + MOVB BL, 32(AX) + MOVQ SI, 8(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_56_amd64_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with overread error +error_overread: + MOVQ $0x00000006, ret+24(FP) + RET + +// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: BMI, BMI2, CMOV +TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 + MOVQ br+8(FP), BX + MOVQ 24(BX), AX + MOVBQZX 32(BX), DX + MOVQ (BX), CX + MOVQ 8(BX), BX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + MOVQ 104(CX), R9 + MOVQ s+0(FP), CX + MOVQ 144(CX), R10 + MOVQ 152(CX), R11 + MOVQ 160(CX), R12 + +sequenceDecs_decode_bmi2_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. 
+ CMPQ BX, $0x08 + JL sequenceDecs_decode_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R13 + MOVQ (R13), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decode_bmi2_fill_end + +sequenceDecs_decode_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decode_bmi2_fill_check_overread + CMPQ DX, $0x07 + JLE sequenceDecs_decode_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R13 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R13), CX + ORQ CX, AX + JMP sequenceDecs_decode_bmi2_fill_byte_by_byte + +sequenceDecs_decode_bmi2_fill_check_overread: + CMPQ DX, $0x40 + JA error_overread + +sequenceDecs_decode_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 16(R9) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 8(R9) + + // Fill bitreader to have enough for the remaining + CMPQ BX, $0x08 + JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R13 + MOVQ (R13), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decode_bmi2_fill_2_end + +sequenceDecs_decode_bmi2_fill_2_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decode_bmi2_fill_2_check_overread + CMPQ DX, $0x07 + JLE sequenceDecs_decode_bmi2_fill_2_end + SHLQ $0x08, AX + SUBQ $0x01, R13 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R13), CX + ORQ CX, AX + JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte + +sequenceDecs_decode_bmi2_fill_2_check_overread: + CMPQ DX, $0x40 + JA error_overread + +sequenceDecs_decode_bmi2_fill_2_end: + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, (R9) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_bmi2_skip_update + LEAQ (SI)(DI*1), R14 + ADDQ R8, R14 + MOVBQZX R14, R14 + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + + // Update Offset State + BZHIQ R8, R15, CX + SHRXQ R8, R15, R15 + SHRL $0x10, R8 + ADDQ CX, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Match Length State + BZHIQ DI, R15, CX + SHRXQ DI, R15, R15 + SHRL $0x10, DI + ADDQ CX, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Literal Length State + BZHIQ SI, R15, CX + SHRL $0x10, SI + ADDQ CX, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + +sequenceDecs_decode_bmi2_skip_update: + // Adjust offset + MOVQ 16(R9), CX + CMPQ R13, $0x01 + JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 + MOVQ R11, R12 + MOVQ R10, R11 + MOVQ CX, R10 + JMP sequenceDecs_decode_bmi2_after_adjust + +sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: + CMPQ (R9), $0x00000000 + JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero + +sequenceDecs_decode_bmi2_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero + MOVQ R10, CX + JMP sequenceDecs_decode_bmi2_after_adjust + 
+sequenceDecs_decode_bmi2_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_bmi2_adjust_zero + JEQ sequenceDecs_decode_bmi2_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_bmi2_adjust_three + JMP sequenceDecs_decode_bmi2_adjust_two + +sequenceDecs_decode_bmi2_adjust_zero: + MOVQ R10, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_one: + MOVQ R11, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_two: + MOVQ R12, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_three: + LEAQ -1(R10), R13 + +sequenceDecs_decode_bmi2_adjust_test_temp_valid: + TESTQ R13, R13 + JNZ sequenceDecs_decode_bmi2_adjust_temp_valid + MOVQ $0x00000001, R13 + +sequenceDecs_decode_bmi2_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R11, R12 + MOVQ R10, R11 + MOVQ R13, R10 + MOVQ R13, CX + +sequenceDecs_decode_bmi2_after_adjust: + MOVQ CX, 16(R9) + + // Check values + MOVQ 8(R9), R13 + MOVQ (R9), R14 + LEAQ (R13)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ R13, $0x00020002 + JA sequenceDecs_decode_bmi2_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok + TESTQ R13, R13 + JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decode_bmi2_match_len_ofs_ok: + ADDQ $0x18, R9 + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decode_bmi2_main_loop + MOVQ s+0(FP), CX + MOVQ R10, 144(CX) + MOVQ R11, 152(CX) + MOVQ R12, 160(CX) + MOVQ br+8(FP), CX + MOVQ AX, 24(CX) + MOVB DL, 32(CX) + MOVQ BX, 8(CX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_bmi2_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with overread error +error_overread: + MOVQ $0x00000006, ret+24(FP) + RET + +// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: BMI, BMI2, CMOV +TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 + MOVQ br+8(FP), BX + MOVQ 24(BX), AX + MOVBQZX 32(BX), DX + MOVQ (BX), CX + MOVQ 8(BX), BX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + MOVQ 104(CX), R9 + MOVQ s+0(FP), CX + MOVQ 144(CX), R10 + MOVQ 152(CX), R11 + MOVQ 160(CX), R12 + +sequenceDecs_decode_56_bmi2_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. 
+ CMPQ BX, $0x08 + JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R13 + MOVQ (R13), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decode_56_bmi2_fill_end + +sequenceDecs_decode_56_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decode_56_bmi2_fill_check_overread + CMPQ DX, $0x07 + JLE sequenceDecs_decode_56_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R13 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R13), CX + ORQ CX, AX + JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte + +sequenceDecs_decode_56_bmi2_fill_check_overread: + CMPQ DX, $0x40 + JA error_overread + +sequenceDecs_decode_56_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 16(R9) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 8(R9) + + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, (R9) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_56_bmi2_skip_update + LEAQ (SI)(DI*1), R14 + ADDQ R8, R14 + MOVBQZX R14, R14 + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + + // Update Offset State + BZHIQ R8, R15, CX + SHRXQ R8, R15, R15 + SHRL $0x10, R8 + ADDQ CX, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Match Length State + BZHIQ DI, R15, CX + SHRXQ DI, R15, R15 + SHRL $0x10, DI + ADDQ CX, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Literal Length State + BZHIQ SI, R15, CX + SHRL $0x10, SI + ADDQ CX, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + +sequenceDecs_decode_56_bmi2_skip_update: + // Adjust offset + MOVQ 16(R9), CX + CMPQ R13, $0x01 + JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 + MOVQ R11, R12 + MOVQ R10, R11 + MOVQ CX, R10 + JMP sequenceDecs_decode_56_bmi2_after_adjust + +sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0: + CMPQ (R9), $0x00000000 + JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero + +sequenceDecs_decode_56_bmi2_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero + MOVQ R10, CX + JMP sequenceDecs_decode_56_bmi2_after_adjust + +sequenceDecs_decode_56_bmi2_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_56_bmi2_adjust_zero + JEQ sequenceDecs_decode_56_bmi2_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_56_bmi2_adjust_three + JMP sequenceDecs_decode_56_bmi2_adjust_two + +sequenceDecs_decode_56_bmi2_adjust_zero: + MOVQ R10, R13 + JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_56_bmi2_adjust_one: + MOVQ R11, R13 + JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_56_bmi2_adjust_two: + MOVQ R12, R13 + JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_56_bmi2_adjust_three: + LEAQ -1(R10), R13 + 
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid: + TESTQ R13, R13 + JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid + MOVQ $0x00000001, R13 + +sequenceDecs_decode_56_bmi2_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R11, R12 + MOVQ R10, R11 + MOVQ R13, R10 + MOVQ R13, CX + +sequenceDecs_decode_56_bmi2_after_adjust: + MOVQ CX, 16(R9) + + // Check values + MOVQ 8(R9), R13 + MOVQ (R9), R14 + LEAQ (R13)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ R13, $0x00020002 + JA sequenceDecs_decode_56_bmi2_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok + TESTQ R13, R13 + JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decode_56_bmi2_match_len_ofs_ok: + ADDQ $0x18, R9 + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decode_56_bmi2_main_loop + MOVQ s+0(FP), CX + MOVQ R10, 144(CX) + MOVQ R11, 152(CX) + MOVQ R12, 160(CX) + MOVQ br+8(FP), CX + MOVQ AX, 24(CX) + MOVB DL, 32(CX) + MOVQ BX, 8(CX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_56_bmi2_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with overread error +error_overread: + MOVQ $0x00000006, ret+24(FP) + RET + +// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool +// Requires: SSE +TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 + MOVQ ctx+0(FP), R10 + MOVQ 8(R10), CX + TESTQ CX, CX + JZ empty_seqs + MOVQ (R10), AX + MOVQ 24(R10), DX + MOVQ 32(R10), BX + MOVQ 80(R10), SI + MOVQ 104(R10), DI + MOVQ 120(R10), R8 + MOVQ 56(R10), R9 + MOVQ 64(R10), R10 + ADDQ R10, R9 + + // seqsBase += 24 * seqIndex + LEAQ (DX)(DX*2), R11 + SHLQ $0x03, R11 + ADDQ R11, AX + + // outBase += outPosition + ADDQ DI, BX + +main_loop: + MOVQ (AX), R11 + MOVQ 16(AX), R12 + MOVQ 8(AX), R13 + + // Copy literals + TESTQ R11, R11 + JZ check_offset + XORQ R14, R14 + +copy_1: + MOVUPS (SI)(R14*1), X0 + MOVUPS X0, (BX)(R14*1) + ADDQ $0x10, R14 + CMPQ R14, R11 + JB copy_1 + ADDQ R11, SI + ADDQ R11, BX + ADDQ R11, DI + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + LEAQ (DI)(R10*1), R11 + CMPQ R12, R11 + JG error_match_off_too_big + CMPQ R12, R8 + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + 
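+ // The 4through7 and 8through16 cases each cover a whole length range with two possibly overlapping loads and stores, one anchored at the start and one at the end of the span, avoiding a per-byte loop.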
+copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + +copy_4_end: + ADDQ R13, DI + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + JMP loop_finished + +copy_all_from_history: + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: + ADDQ R11, DI + SUBQ R11, R13 + + // Copy match from the current buffer +copy_match: + MOVQ BX, R11 + SUBQ R12, R11 + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, DI + MOVQ BX, R12 + ADDQ R13, BX + +copy_2: + MOVUPS (R11), X0 + MOVUPS X0, (R12) + ADDQ $0x10, R11 + ADDQ $0x10, R12 + SUBQ $0x10, R13 + JHI copy_2 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, DI + +copy_slow_3: + MOVB (R11), R12 + MOVB R12, (BX) + INCQ R11 + INCQ BX + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + +loop_finished: + // Return value + MOVB $0x01, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + SUBQ 80(AX), SI + MOVQ SI, 112(AX) + RET + +error_match_off_too_big: + // Return value + MOVB $0x00, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + SUBQ 80(AX), SI + MOVQ SI, 112(AX) + RET + +empty_seqs: + // Return value + MOVB $0x01, ret+8(FP) + RET + +// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool +// Requires: SSE +TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9 + MOVQ ctx+0(FP), R10 + MOVQ 8(R10), CX + TESTQ CX, CX + JZ empty_seqs + MOVQ (R10), AX + MOVQ 24(R10), DX + MOVQ 32(R10), BX + MOVQ 80(R10), SI + MOVQ 104(R10), DI + MOVQ 120(R10), R8 + MOVQ 56(R10), R9 + MOVQ 64(R10), R10 + ADDQ R10, R9 + + // seqsBase += 24 * seqIndex + LEAQ (DX)(DX*2), R11 + SHLQ $0x03, R11 + ADDQ R11, AX + + // outBase += outPosition + ADDQ DI, BX + +main_loop: + MOVQ (AX), R11 + MOVQ 16(AX), R12 + MOVQ 8(AX), R13 + + // Copy literals + TESTQ R11, R11 + JZ check_offset + MOVQ R11, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (SI), X0 + MOVUPS X0, (BX) + ADDQ $0x10, SI + ADDQ $0x10, BX + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(SI)(R14*1), SI + LEAQ 16(BX)(R14*1), BX + MOVUPS -16(SI), X0 + MOVUPS X0, -16(BX) + JMP copy_1_end + +copy_1_small: + CMPQ R11, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ R11, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (SI), R14 + MOVB -1(SI)(R11*1), R15 + MOVB 
R14, (BX) + MOVB R15, -1(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end + +copy_1_move_3: + MOVW (SI), R14 + MOVB 2(SI), R15 + MOVW R14, (BX) + MOVB R15, 2(BX) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end + +copy_1_move_4through7: + MOVL (SI), R14 + MOVL -4(SI)(R11*1), R15 + MOVL R14, (BX) + MOVL R15, -4(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (SI), R14 + MOVQ -8(SI)(R11*1), R15 + MOVQ R14, (BX) + MOVQ R15, -8(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + +copy_1_end: + ADDQ R11, DI + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + LEAQ (DI)(R10*1), R11 + CMPQ R12, R11 + JG error_match_off_too_big + CMPQ R12, R8 + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + +copy_4_end: + ADDQ R13, DI + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + JMP loop_finished + +copy_all_from_history: + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: + ADDQ R11, DI + SUBQ R11, R13 + + // Copy match from the current buffer +copy_match: + MOVQ BX, R11 + SUBQ R12, R11 + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, DI + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small + +copy_2_loop: + MOVUPS (R11), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R11 + ADDQ $0x10, BX + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(R11)(R12*1), R11 + LEAQ 16(BX)(R12*1), BX + MOVUPS -16(R11), X0 + MOVUPS X0, -16(BX) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 
+ JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (R11), R12 + MOVB -1(R11)(R13*1), R14 + MOVB R12, (BX) + MOVB R14, -1(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_3: + MOVW (R11), R12 + MOVB 2(R11), R14 + MOVW R12, (BX) + MOVB R14, 2(BX) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_4through7: + MOVL (R11), R12 + MOVL -4(R11)(R13*1), R14 + MOVL R12, (BX) + MOVL R14, -4(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (R11), R12 + MOVQ -8(R11)(R13*1), R14 + MOVQ R12, (BX) + MOVQ R14, -8(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + +copy_2_end: + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, DI + +copy_slow_3: + MOVB (R11), R12 + MOVB R12, (BX) + INCQ R11 + INCQ BX + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + +loop_finished: + // Return value + MOVB $0x01, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + SUBQ 80(AX), SI + MOVQ SI, 112(AX) + RET + +error_match_off_too_big: + // Return value + MOVB $0x00, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + SUBQ 80(AX), SI + MOVQ SI, 112(AX) + RET + +empty_seqs: + // Return value + MOVB $0x01, ret+8(FP) + RET + +// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: CMOV, SSE +TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 + MOVQ br+8(FP), CX + MOVQ 24(CX), DX + MOVBQZX 32(CX), BX + MOVQ (CX), AX + MOVQ 8(CX), SI + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 24(SP) + MOVQ 112(AX), R10 + MOVQ 128(AX), CX + MOVQ CX, 32(SP) + MOVQ 144(AX), R11 + MOVQ 136(AX), R12 + MOVQ 200(AX), CX + MOVQ CX, 56(SP) + MOVQ 176(AX), CX + MOVQ CX, 48(SP) + MOVQ 184(AX), AX + MOVQ AX, 40(SP) + MOVQ 40(SP), AX + ADDQ AX, 48(SP) + + // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) + ADDQ R10, 32(SP) + + // outBase += outPosition + ADDQ R12, R10 + +sequenceDecs_decodeSync_amd64_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. 
+ CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_amd64_fill_end + +sequenceDecs_decodeSync_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_amd64_fill_check_overread + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte + +sequenceDecs_decodeSync_amd64_fill_check_overread: + CMPQ BX, $0x40 + JA error_overread + +sequenceDecs_decodeSync_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_of_update_zero: + MOVQ AX, 8(SP) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ml_update_zero: + MOVQ AX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_amd64_fill_2_end + +sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_amd64_fill_2_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte + +sequenceDecs_decodeSync_amd64_fill_2_check_overread: + CMPQ BX, $0x40 + JA error_overread + +sequenceDecs_decodeSync_amd64_fill_2_end: + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ll_update_zero: + MOVQ AX, 24(SP) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R13 + SHRL $0x10, DI + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, DI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R13 + SHRL $0x10, R8 + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, R8 + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + 
MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R13 + SHRL $0x10, R9 + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, R9 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decodeSync_amd64_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ AX, $0x01 + JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_amd64_after_adjust + +sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero + +sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_amd64_after_adjust + +sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: + MOVQ R13, AX + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, AX + CMOVQEQ R15, R14 + ADDQ 144(CX)(AX*8), R14 + JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_amd64_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_amd64_adjust_skip + MOVQ 152(CX), AX + MOVQ AX, 160(CX) + +sequenceDecs_decodeSync_amd64_adjust_skip: + MOVQ 144(CX), AX + MOVQ AX, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_amd64_after_adjust: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), AX + MOVQ 24(SP), CX + LEAQ (AX)(CX*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ CX, 104(R14) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decodeSync_amd64_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_amd64_match_len_ofs_ok: + MOVQ 24(SP), AX + MOVQ 8(SP), CX + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (AX)(R13*1), R14 + ADDQ R10, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ AX, AX + JZ check_offset + XORQ R14, R14 + +copy_1: + MOVUPS (R11)(R14*1), X0 + MOVUPS X0, (R10)(R14*1) + ADDQ $0x10, R14 + CMPQ R14, AX + JB copy_1 + ADDQ AX, R11 + ADDQ AX, R10 + ADDQ AX, R12 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R12, AX + ADDQ 40(SP), AX + CMPQ CX, AX + JG error_match_off_too_big + CMPQ CX, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + 
ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: + ADDQ AX, R12 + SUBQ AX, R13 + + // Copy match from the current buffer +copy_match: + MOVQ R10, AX + SUBQ CX, AX + + // ml <= mo + CMPQ R13, CX + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, R12 + MOVQ R10, CX + ADDQ R13, R10 + +copy_2: + MOVUPS (AX), X0 + MOVUPS X0, (CX) + ADDQ $0x10, AX + ADDQ $0x10, CX + SUBQ $0x10, R13 + JHI copy_2 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, R12 + +copy_slow_3: + MOVB (AX), CL + MOVB CL, (R10) + INCQ AX + INCQ R10 + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decodeSync_amd64_main_loop + +loop_finished: + MOVQ br+8(FP), AX + MOVQ DX, 24(AX) + MOVB BL, 32(AX) + MOVQ SI, 8(AX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R12, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R11 + MOVQ R11, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_amd64_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with overread error +error_overread: + MOVQ $0x00000006, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: BMI, BMI2, CMOV, SSE +TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 + MOVQ br+8(FP), BX + MOVQ 24(BX), AX + MOVBQZX 
32(BX), DX + MOVQ (BX), CX + MOVQ 8(BX), BX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) + MOVQ 112(CX), R9 + MOVQ 128(CX), R10 + MOVQ R10, 32(SP) + MOVQ 144(CX), R10 + MOVQ 136(CX), R11 + MOVQ 200(CX), R12 + MOVQ R12, 56(SP) + MOVQ 176(CX), R12 + MOVQ R12, 48(SP) + MOVQ 184(CX), CX + MOVQ CX, 40(SP) + MOVQ 40(SP), CX + ADDQ CX, 48(SP) + + // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) + ADDQ R9, 32(SP) + + // outBase += outPosition + ADDQ R11, R9 + +sequenceDecs_decodeSync_bmi2_main_loop: + MOVQ (SP), R12 + + // Fill bitreader to have enough for the offset and match length. + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_bmi2_fill_end + +sequenceDecs_decodeSync_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_bmi2_fill_check_overread + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte + +sequenceDecs_decodeSync_bmi2_fill_check_overread: + CMPQ DX, $0x40 + JA error_overread + +sequenceDecs_decodeSync_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 8(SP) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_bmi2_fill_2_end + +sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_bmi2_fill_2_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte + +sequenceDecs_decodeSync_bmi2_fill_2_check_overread: + CMPQ DX, $0x40 + JA error_overread + +sequenceDecs_decodeSync_bmi2_fill_2_end: + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 24(SP) + + // Fill bitreader for state updates + MOVQ R12, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R12 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_bmi2_skip_update + LEAQ (SI)(DI*1), R13 + ADDQ R8, R13 + MOVBQZX R13, R13 + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + + // Update Offset State + BZHIQ R8, R14, CX + SHRXQ R8, R14, R14 + SHRL $0x10, R8 + ADDQ CX, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Match Length State + BZHIQ DI, R14, CX + SHRXQ DI, R14, R14 + SHRL $0x10, DI + ADDQ CX, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Literal 
Length State + BZHIQ SI, R14, CX + SHRL $0x10, SI + ADDQ CX, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + +sequenceDecs_decodeSync_bmi2_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ R12, $0x01 + JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_bmi2_after_adjust + +sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero + +sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_bmi2_after_adjust + +sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: + MOVQ R13, R12 + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, R12 + CMOVQEQ R15, R14 + ADDQ 144(CX)(R12*8), R14 + JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_bmi2_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_bmi2_adjust_skip + MOVQ 152(CX), R12 + MOVQ R12, 160(CX) + +sequenceDecs_decodeSync_bmi2_adjust_skip: + MOVQ 144(CX), R12 + MOVQ R12, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_bmi2_after_adjust: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), CX + MOVQ 24(SP), R12 + LEAQ (CX)(R12*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ R12, 104(R14) + JS error_not_enough_literals + CMPQ CX, $0x00020002 + JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok + TESTQ CX, CX + JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_bmi2_match_len_ofs_ok: + MOVQ 24(SP), CX + MOVQ 8(SP), R12 + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (CX)(R13*1), R14 + ADDQ R9, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ CX, CX + JZ check_offset + XORQ R14, R14 + +copy_1: + MOVUPS (R10)(R14*1), X0 + MOVUPS X0, (R9)(R14*1) + ADDQ $0x10, R14 + CMPQ R14, CX + JB copy_1 + ADDQ CX, R10 + ADDQ CX, R9 + ADDQ CX, R11 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R11, CX + ADDQ 40(SP), CX + CMPQ R12, CX + JG error_match_off_too_big + CMPQ R12, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + 
+copy_4_end: + ADDQ R13, R11 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: + ADDQ CX, R11 + SUBQ CX, R13 + + // Copy match from the current buffer +copy_match: + MOVQ R9, CX + SUBQ R12, CX + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, R11 + MOVQ R9, R12 + ADDQ R13, R9 + +copy_2: + MOVUPS (CX), X0 + MOVUPS X0, (R12) + ADDQ $0x10, CX + ADDQ $0x10, R12 + SUBQ $0x10, R13 + JHI copy_2 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, R11 + +copy_slow_3: + MOVB (CX), R12 + MOVB R12, (R9) + INCQ CX + INCQ R9 + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decodeSync_bmi2_main_loop + +loop_finished: + MOVQ br+8(FP), CX + MOVQ AX, 24(CX) + MOVB DL, 32(CX) + MOVQ BX, 8(CX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R11, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R10 + MOVQ R10, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_bmi2_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with overread error +error_overread: + MOVQ $0x00000006, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: CMOV, SSE +TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 + MOVQ br+8(FP), CX + MOVQ 24(CX), DX + MOVBQZX 32(CX), BX + MOVQ (CX), AX + MOVQ 8(CX), SI + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 
24(SP) + MOVQ 112(AX), R10 + MOVQ 128(AX), CX + MOVQ CX, 32(SP) + MOVQ 144(AX), R11 + MOVQ 136(AX), R12 + MOVQ 200(AX), CX + MOVQ CX, 56(SP) + MOVQ 176(AX), CX + MOVQ CX, 48(SP) + MOVQ 184(AX), AX + MOVQ AX, 40(SP) + MOVQ 40(SP), AX + ADDQ AX, 48(SP) + + // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) + ADDQ R10, 32(SP) + + // outBase += outPosition + ADDQ R12, R10 + +sequenceDecs_decodeSync_safe_amd64_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_safe_amd64_fill_end + +sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_safe_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte + +sequenceDecs_decodeSync_safe_amd64_fill_check_overread: + CMPQ BX, $0x40 + JA error_overread + +sequenceDecs_decodeSync_safe_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_of_update_zero: + MOVQ AX, 8(SP) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ml_update_zero: + MOVQ AX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end + +sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte + +sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread: + CMPQ BX, $0x40 + JA error_overread + +sequenceDecs_decodeSync_safe_amd64_fill_2_end: + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ll_update_zero: + MOVQ AX, 24(SP) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + 
JZ sequenceDecs_decodeSync_safe_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R13 + SHRL $0x10, DI + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, DI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R13 + SHRL $0x10, R8 + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, R8 + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R13 + SHRL $0x10, R9 + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, R9 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decodeSync_safe_amd64_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ AX, $0x01 + JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust + +sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero + +sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust + +sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: + MOVQ R13, AX + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, AX + CMOVQEQ R15, R14 + ADDQ 144(CX)(AX*8), R14 + JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip + MOVQ 152(CX), AX + MOVQ AX, 160(CX) + +sequenceDecs_decodeSync_safe_amd64_adjust_skip: + MOVQ 144(CX), AX + MOVQ AX, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_safe_amd64_after_adjust: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), AX + MOVQ 24(SP), CX + LEAQ (AX)(CX*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ CX, 104(R14) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: + MOVQ 24(SP), AX + MOVQ 8(SP), CX + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (AX)(R13*1), R14 + ADDQ R10, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ AX, AX + JZ check_offset + MOVQ AX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R11), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R11 + ADDQ $0x10, R10 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R11)(R14*1), R11 + LEAQ 16(R10)(R14*1), R10 + MOVUPS -16(R11), X0 + MOVUPS X0, -16(R10) + JMP copy_1_end + +copy_1_small: + CMPQ AX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ AX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + 
+copy_1_move_1or2: + MOVB (R11), R14 + MOVB -1(R11)(AX*1), R15 + MOVB R14, (R10) + MOVB R15, -1(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_3: + MOVW (R11), R14 + MOVB 2(R11), R15 + MOVW R14, (R10) + MOVB R15, 2(R10) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R11), R14 + MOVL -4(R11)(AX*1), R15 + MOVL R14, (R10) + MOVL R15, -4(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (R11), R14 + MOVQ -8(R11)(AX*1), R15 + MOVQ R14, (R10) + MOVQ R15, -8(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + +copy_1_end: + ADDQ AX, R12 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R12, AX + ADDQ 40(SP), AX + CMPQ CX, AX + JG error_match_off_too_big + CMPQ CX, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: + ADDQ AX, R12 + SUBQ AX, R13 + + // Copy match from the current buffer +copy_match: + MOVQ R10, AX + SUBQ CX, AX + + // ml <= mo + CMPQ R13, CX + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, R12 + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_2_small + +copy_2_loop: + MOVUPS (AX), X0 + MOVUPS X0, (R10) + ADDQ $0x10, AX + ADDQ $0x10, R10 + SUBQ $0x10, CX + JAE copy_2_loop + LEAQ 16(AX)(CX*1), AX + LEAQ 16(R10)(CX*1), R10 + MOVUPS -16(AX), X0 + MOVUPS X0, -16(R10) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 
+ JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (AX), CL + MOVB -1(AX)(R13*1), R14 + MOVB CL, (R10) + MOVB R14, -1(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_3: + MOVW (AX), CX + MOVB 2(AX), R14 + MOVW CX, (R10) + MOVB R14, 2(R10) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (AX), CX + MOVL -4(AX)(R13*1), R14 + MOVL CX, (R10) + MOVL R14, -4(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (AX), CX + MOVQ -8(AX)(R13*1), R14 + MOVQ CX, (R10) + MOVQ R14, -8(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + +copy_2_end: + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, R12 + +copy_slow_3: + MOVB (AX), CL + MOVB CL, (R10) + INCQ AX + INCQ R10 + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decodeSync_safe_amd64_main_loop + +loop_finished: + MOVQ br+8(FP), AX + MOVQ DX, 24(AX) + MOVB BL, 32(AX) + MOVQ SI, 8(AX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R12, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R11 + MOVQ R11, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with overread error +error_overread: + MOVQ $0x00000006, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: BMI, BMI2, CMOV, SSE +TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 + MOVQ br+8(FP), BX + MOVQ 24(BX), AX + MOVBQZX 32(BX), DX + MOVQ (BX), CX + MOVQ 8(BX), BX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) + MOVQ 112(CX), R9 + MOVQ 128(CX), R10 + MOVQ R10, 32(SP) + MOVQ 144(CX), R10 + MOVQ 136(CX), R11 + MOVQ 200(CX), R12 + MOVQ R12, 56(SP) + MOVQ 176(CX), R12 + MOVQ R12, 48(SP) + MOVQ 184(CX), CX + MOVQ CX, 40(SP) + MOVQ 40(SP), CX + ADDQ CX, 48(SP) + + // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) + ADDQ R9, 32(SP) + + // outBase += outPosition + ADDQ R11, R9 + +sequenceDecs_decodeSync_safe_bmi2_main_loop: + MOVQ (SP), R12 + + // Fill bitreader to have enough for the offset and match length. 
+ CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_end + +sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte + +sequenceDecs_decodeSync_safe_bmi2_fill_check_overread: + CMPQ DX, $0x40 + JA error_overread + +sequenceDecs_decodeSync_safe_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 8(SP) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end + +sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte + +sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread: + CMPQ DX, $0x40 + JA error_overread + +sequenceDecs_decodeSync_safe_bmi2_fill_2_end: + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 24(SP) + + // Fill bitreader for state updates + MOVQ R12, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R12 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_safe_bmi2_skip_update + LEAQ (SI)(DI*1), R13 + ADDQ R8, R13 + MOVBQZX R13, R13 + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + + // Update Offset State + BZHIQ R8, R14, CX + SHRXQ R8, R14, R14 + SHRL $0x10, R8 + ADDQ CX, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Match Length State + BZHIQ DI, R14, CX + SHRXQ DI, R14, R14 + SHRL $0x10, DI + ADDQ CX, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Literal Length State + BZHIQ SI, R14, CX + SHRL $0x10, SI + ADDQ CX, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + +sequenceDecs_decodeSync_safe_bmi2_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ R12, $0x01 + JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust + +sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero + INCQ R13 + JMP 
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero + +sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust + +sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: + MOVQ R13, R12 + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, R12 + CMOVQEQ R15, R14 + ADDQ 144(CX)(R12*8), R14 + JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip + MOVQ 152(CX), R12 + MOVQ R12, 160(CX) + +sequenceDecs_decodeSync_safe_bmi2_adjust_skip: + MOVQ 144(CX), R12 + MOVQ R12, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_safe_bmi2_after_adjust: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), CX + MOVQ 24(SP), R12 + LEAQ (CX)(R12*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ R12, 104(R14) + JS error_not_enough_literals + CMPQ CX, $0x00020002 + JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok + TESTQ CX, CX + JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: + MOVQ 24(SP), CX + MOVQ 8(SP), R12 + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (CX)(R13*1), R14 + ADDQ R9, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ CX, CX + JZ check_offset + MOVQ CX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R10), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R10 + ADDQ $0x10, R9 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R10)(R14*1), R10 + LEAQ 16(R9)(R14*1), R9 + MOVUPS -16(R10), X0 + MOVUPS X0, -16(R9) + JMP copy_1_end + +copy_1_small: + CMPQ CX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ CX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R10), R14 + MOVB -1(R10)(CX*1), R15 + MOVB R14, (R9) + MOVB R15, -1(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_3: + MOVW (R10), R14 + MOVB 2(R10), R15 + MOVW R14, (R9) + MOVB R15, 2(R9) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R10), R14 + MOVL -4(R10)(CX*1), R15 + MOVL R14, (R9) + MOVL R15, -4(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (R10), R14 + MOVQ -8(R10)(CX*1), R15 + MOVQ R14, (R9) + MOVQ R15, -8(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + +copy_1_end: + ADDQ CX, R11 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R11, CX + ADDQ 40(SP), CX + CMPQ R12, CX + JG error_match_off_too_big + CMPQ R12, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 
2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + +copy_4_end: + ADDQ R13, R11 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: + ADDQ CX, R11 + SUBQ CX, R13 + + // Copy match from the current buffer +copy_match: + MOVQ R9, CX + SUBQ R12, CX + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, R11 + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small + +copy_2_loop: + MOVUPS (CX), X0 + MOVUPS X0, (R9) + ADDQ $0x10, CX + ADDQ $0x10, R9 + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(CX)(R12*1), CX + LEAQ 16(R9)(R12*1), R9 + MOVUPS -16(CX), X0 + MOVUPS X0, -16(R9) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (CX), R12 + MOVB -1(CX)(R13*1), R14 + MOVB R12, (R9) + MOVB R14, -1(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_3: + MOVW (CX), R12 + MOVB 2(CX), R14 + MOVW R12, (R9) + MOVB R14, 2(R9) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (CX), R12 + MOVL -4(CX)(R13*1), R14 + MOVL R12, (R9) + MOVL R14, -4(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (CX), R12 + MOVQ -8(CX)(R13*1), R14 + MOVQ R12, (R9) + MOVQ R14, -8(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + +copy_2_end: + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, R11 + +copy_slow_3: + MOVB (CX), R12 + MOVB R12, (R9) + INCQ CX + INCQ R9 + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decodeSync_safe_bmi2_main_loop + +loop_finished: + MOVQ br+8(FP), CX + MOVQ AX, 24(CX) + MOVB DL, 32(CX) + MOVQ BX, 8(CX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R11, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R10 + MOVQ R10, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error 
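+	// (the offending match length from 16(SP) is reported via the context
+	// before returning code 2)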
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with overread error +error_overread: + MOVQ $0x00000006, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go new file mode 100644 index 0000000000..2fb35b788c --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go @@ -0,0 +1,237 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +package zstd + +import ( + "fmt" + "io" +) + +// decode sequences from the stream with the provided history but without dictionary. +func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { + return false, nil +} + +// decode sequences from the stream without the provided history. +func (s *sequenceDecs) decode(seqs []seqVals) error { + br := s.br + + // Grab full sizes tables, to avoid bounds checks. + llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize] + llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state + s.seqSize = 0 + litRemain := len(s.literals) + + maxBlockSize := maxCompressedBlockSize + if s.windowSize < maxBlockSize { + maxBlockSize = s.windowSize + } + for i := range seqs { + var ll, mo, ml int + if len(br.in) > 4+((maxOffsetBits+16+16)>>3) { + // inlined function: + // ll, mo, ml = s.nextFast(br, llState, mlState, ofState) + + // Final will not read from stream. + var llB, mlB, moB uint8 + ll, llB = llState.final() + ml, mlB = mlState.final() + mo, moB = ofState.final() + + // extra bits are stored in reverse order. + br.fillFast() + mo += br.getBits(moB) + if s.maxBits > 32 { + br.fillFast() + } + ml += br.getBits(mlB) + ll += br.getBits(llB) + + if moB > 1 { + s.prevOffset[2] = s.prevOffset[1] + s.prevOffset[1] = s.prevOffset[0] + s.prevOffset[0] = mo + } else { + // mo = s.adjustOffset(mo, ll, moB) + // Inlined for rather big speedup + if ll == 0 { + // There is an exception though, when current sequence's literals_length = 0. + // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2, + // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte. 
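+				// Here mo is zero-based (0 already selects Repeated_Offset1),
+				// so the shift is a plain increment.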
+ mo++ + } + + if mo == 0 { + mo = s.prevOffset[0] + } else { + var temp int + if mo == 3 { + temp = s.prevOffset[0] - 1 + } else { + temp = s.prevOffset[mo] + } + + if temp == 0 { + // 0 is not valid; input is corrupted; force offset to 1 + println("WARNING: temp was 0") + temp = 1 + } + + if mo != 1 { + s.prevOffset[2] = s.prevOffset[1] + } + s.prevOffset[1] = s.prevOffset[0] + s.prevOffset[0] = temp + mo = temp + } + } + br.fillFast() + } else { + if br.overread() { + if debugDecoder { + printf("reading sequence %d, exceeded available data\n", i) + } + return io.ErrUnexpectedEOF + } + ll, mo, ml = s.next(br, llState, mlState, ofState) + br.fill() + } + + if debugSequences { + println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml) + } + // Evaluate. + // We might be doing this async, so do it early. + if mo == 0 && ml > 0 { + return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml) + } + if ml > maxMatchLen { + return fmt.Errorf("match len (%d) bigger than max allowed length", ml) + } + s.seqSize += ll + ml + if s.seqSize > maxBlockSize { + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } + litRemain -= ll + if litRemain < 0 { + return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll) + } + seqs[i] = seqVals{ + ll: ll, + ml: ml, + mo: mo, + } + if i == len(seqs)-1 { + // This is the last sequence, so we shouldn't update state. + break + } + + // Manually inlined, ~ 5-20% faster + // Update all 3 states at once. Approx 20% faster. + nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits() + if nBits == 0 { + llState = llTable[llState.newState()&maxTableMask] + mlState = mlTable[mlState.newState()&maxTableMask] + ofState = ofTable[ofState.newState()&maxTableMask] + } else { + bits := br.get32BitsFast(nBits) + lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31)) + llState = llTable[(llState.newState()+lowBits)&maxTableMask] + + lowBits = uint16(bits >> (ofState.nbBits() & 31)) + lowBits &= bitMask[mlState.nbBits()&15] + mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask] + + lowBits = uint16(bits) & bitMask[ofState.nbBits()&15] + ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask] + } + } + s.seqSize += litRemain + if s.seqSize > maxBlockSize { + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } + err := br.close() + if err != nil { + printf("Closing sequences: %v, %+v\n", err, *br) + } + return err +} + +// executeSimple handles cases when a dictionary is not used. +func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error { + // Ensure we have enough output size... + if len(s.out)+s.seqSize > cap(s.out) { + addBytes := s.seqSize + len(s.out) + s.out = append(s.out, make([]byte, addBytes)...) + s.out = s.out[:len(s.out)-addBytes] + } + + if debugDecoder { + printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize) + } + + var t = len(s.out) + out := s.out[:t+s.seqSize] + + for _, seq := range seqs { + // Add literals + copy(out[t:], s.literals[:seq.ll]) + t += seq.ll + s.literals = s.literals[seq.ll:] + + // Malformed input + if seq.mo > t+len(hist) || seq.mo > s.windowSize { + return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist)) + } + + // Copy from history. + if v := seq.mo - t; v > 0 { + // v is the start position in history from end. + start := len(hist) - v + if seq.ml > v { + // Some goes into the current block. 
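+				// The match straddles the history boundary: copy the tail of
+				// hist first, then fall through to the in-buffer copy below.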
+ // Copy remainder of history + copy(out[t:], hist[start:]) + t += v + seq.ml -= v + } else { + copy(out[t:], hist[start:start+seq.ml]) + t += seq.ml + continue + } + } + + // We must be in the current buffer now + if seq.ml > 0 { + start := t - seq.mo + if seq.ml <= t-start { + // No overlap + copy(out[t:], out[start:start+seq.ml]) + t += seq.ml + } else { + // Overlapping copy + // Extend destination slice and copy one byte at the time. + src := out[start : start+seq.ml] + dst := out[t:] + dst = dst[:len(src)] + t += len(src) + // Destination is the space we just added. + for i := range src { + dst[i] = src[i] + } + } + } + } + // Add final literals + copy(out[t:], s.literals) + if debugDecoder { + t += len(s.literals) + if t != len(out) { + panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize)) + } + } + s.out = out + + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/seqenc.go b/vendor/github.com/klauspost/compress/zstd/seqenc.go index 36bcc3cc02..8014174a77 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqenc.go +++ b/vendor/github.com/klauspost/compress/zstd/seqenc.go @@ -35,7 +35,6 @@ func (s *seqCoders) setPrev(ll, ml, of *fseEncoder) { // Ensure we cannot reuse by accident prevEnc := *prev prevEnc.symbolLen = 0 - return } compareSwap(ll, &s.llEnc, &s.llPrev) compareSwap(ml, &s.mlEnc, &s.mlPrev) diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go index 841fd95acc..ec13594e89 100644 --- a/vendor/github.com/klauspost/compress/zstd/snappy.go +++ b/vendor/github.com/klauspost/compress/zstd/snappy.go @@ -11,7 +11,7 @@ import ( "io" "github.com/klauspost/compress/huff0" - "github.com/klauspost/compress/snappy" + snappy "github.com/klauspost/compress/internal/snapref" ) const ( @@ -95,10 +95,9 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) { var written int64 var readHeader bool { - var header []byte - var n int - header, r.err = frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0]) + header := frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0]) + var n int n, r.err = w.Write(header) if r.err != nil { return written, r.err @@ -185,7 +184,6 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) { r.block.reset(nil) r.block.literals, err = snappy.Decode(r.block.literals[:n], r.buf[snappyChecksumSize:chunkLen]) if err != nil { - println("snappy.Decode:", err) return written, err } err = r.block.encodeLits(r.block.literals, false) @@ -204,7 +202,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) { written += int64(n) continue case chunkTypeUncompressedData: - if debug { + if debugEncoder { println("Uncompressed, chunklen", chunkLen) } // Section 4.3. Uncompressed data (chunk type 0x01). @@ -247,7 +245,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) { continue case chunkTypeStreamIdentifier: - if debug { + if debugEncoder { println("stream id", chunkLen, len(snappyMagicBody)) } // Section 4.1. Stream identifier (chunk type 0xff). 
@@ -417,7 +415,7 @@ var crcTable = crc32.MakeTable(crc32.Castagnoli) // https://github.com/google/snappy/blob/master/framing_format.txt func snappyCRC(b []byte) uint32 { c := crc32.Update(0, crcTable, b) - return uint32(c>>15|c<<17) + 0xa282ead8 + return c>>15 | c<<17 + 0xa282ead8 } // snappyDecodedLen returns the length of the decoded block and the number of bytes diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go new file mode 100644 index 0000000000..29c15c8c4e --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/zip.go @@ -0,0 +1,141 @@ +// Copyright 2019+ Klaus Post. All rights reserved. +// License information can be found in the LICENSE file. + +package zstd + +import ( + "errors" + "io" + "sync" +) + +// ZipMethodWinZip is the method for Zstandard compressed data inside Zip files for WinZip. +// See https://www.winzip.com/win/en/comp_info.html +const ZipMethodWinZip = 93 + +// ZipMethodPKWare is the original method number used by PKWARE to indicate Zstandard compression. +// Deprecated: This has been deprecated by PKWARE, use ZipMethodWinZip instead for compression. +// See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT +const ZipMethodPKWare = 20 + +// zipReaderPool is the default reader pool. +var zipReaderPool = sync.Pool{New: func() interface{} { + z, err := NewReader(nil, WithDecoderLowmem(true), WithDecoderMaxWindow(128<<20), WithDecoderConcurrency(1)) + if err != nil { + panic(err) + } + return z +}} + +// newZipReader creates a pooled zip decompressor. +func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser { + pool := &zipReaderPool + if len(opts) > 0 { + opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...) + // Force concurrency 1 + opts = append(opts, WithDecoderConcurrency(1)) + // Create our own pool + pool = &sync.Pool{} + } + return func(r io.Reader) io.ReadCloser { + dec, ok := pool.Get().(*Decoder) + if ok { + dec.Reset(r) + } else { + d, err := NewReader(r, opts...) + if err != nil { + panic(err) + } + dec = d + } + return &pooledZipReader{dec: dec, pool: pool} + } +} + +type pooledZipReader struct { + mu sync.Mutex // guards Close and Read + pool *sync.Pool + dec *Decoder +} + +func (r *pooledZipReader) Read(p []byte) (n int, err error) { + r.mu.Lock() + defer r.mu.Unlock() + if r.dec == nil { + return 0, errors.New("read after close or EOF") + } + dec, err := r.dec.Read(p) + if err == io.EOF { + r.dec.Reset(nil) + r.pool.Put(r.dec) + r.dec = nil + } + return dec, err +} + +func (r *pooledZipReader) Close() error { + r.mu.Lock() + defer r.mu.Unlock() + var err error + if r.dec != nil { + err = r.dec.Reset(nil) + r.pool.Put(r.dec) + r.dec = nil + } + return err +} + +type pooledZipWriter struct { + mu sync.Mutex // guards Close and Read + enc *Encoder + pool *sync.Pool +} + +func (w *pooledZipWriter) Write(p []byte) (n int, err error) { + w.mu.Lock() + defer w.mu.Unlock() + if w.enc == nil { + return 0, errors.New("Write after Close") + } + return w.enc.Write(p) +} + +func (w *pooledZipWriter) Close() error { + w.mu.Lock() + defer w.mu.Unlock() + var err error + if w.enc != nil { + err = w.enc.Close() + w.pool.Put(w.enc) + w.enc = nil + } + return err +} + +// ZipCompressor returns a compressor that can be registered with zip libraries. +// The provided encoder options will be used on all encodes. 
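+//
+// A minimal registration sketch (w is a hypothetical io.Writer, wrapped via
+// the standard archive/zip package):
+//
+//	zw := zip.NewWriter(w)
+//	zw.RegisterCompressor(ZipMethodWinZip, ZipCompressor())
+//	defer zw.Close()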
+func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) { + var pool sync.Pool + return func(w io.Writer) (io.WriteCloser, error) { + enc, ok := pool.Get().(*Encoder) + if ok { + enc.Reset(w) + } else { + var err error + enc, err = NewWriter(w, opts...) + if err != nil { + return nil, err + } + } + return &pooledZipWriter{enc: enc, pool: &pool}, nil + } +} + +// ZipDecompressor returns a decompressor that can be registered with zip libraries. +// See ZipCompressor for example. +// Options can be specified. WithDecoderConcurrency(1) is forced, +// and by default a 128MB maximum decompression window is specified. +// The window size can be overridden if required. +func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser { + return newZipReader(opts...) +} diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go index 0807719c8b..066bef2a4f 100644 --- a/vendor/github.com/klauspost/compress/zstd/zstd.go +++ b/vendor/github.com/klauspost/compress/zstd/zstd.go @@ -4,15 +4,22 @@ package zstd import ( + "bytes" + "encoding/binary" "errors" "log" "math" - "math/bits" ) // enable debug printing const debug = false +// enable encoding debug printing +const debugEncoder = debug + +// enable decoding debug printing +const debugDecoder = debug + // Enable extra assertions. const debugAsserts = debug || false @@ -28,8 +35,8 @@ const forcePreDef = false // zstdMinMatch is the minimum zstd match length. const zstdMinMatch = 3 -// Reset the buffer offset when reaching this. -const bufferReset = math.MaxInt32 - MaxWindowSize +// fcsUnknown is used for unknown frame content size. +const fcsUnknown = math.MaxUint64 var ( // ErrReservedBlockType is returned when a reserved block type is found. @@ -44,6 +51,10 @@ var ( // Typically returned on invalid input. ErrBlockTooSmall = errors.New("block too small") + // ErrUnexpectedBlockSize is returned when a block has unexpected size. + // Typically returned on invalid input. + ErrUnexpectedBlockSize = errors.New("unexpected block size") + // ErrMagicMismatch is returned when a "magic" number isn't what is expected. // Typically this indicates wrong or corrupted input. ErrMagicMismatch = errors.New("invalid input: magic number mismatch") @@ -60,85 +71,55 @@ var ( ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit") // ErrUnknownDictionary is returned if the dictionary ID is unknown. - // For the time being dictionaries are not supported. ErrUnknownDictionary = errors.New("unknown dictionary") // ErrFrameSizeExceeded is returned if the stated frame size is exceeded. // This is only returned if SingleSegment is specified on the frame. ErrFrameSizeExceeded = errors.New("frame size exceeded") + // ErrFrameSizeMismatch is returned if the stated frame size does not match the expected size. + // This is only returned if SingleSegment is specified on the frame. + ErrFrameSizeMismatch = errors.New("frame size does not match size on stream") + // ErrCRCMismatch is returned if CRC mismatches. ErrCRCMismatch = errors.New("CRC check failed") // ErrDecoderClosed will be returned if the Decoder was used after // Close has been called. ErrDecoderClosed = errors.New("decoder used after Close") + + // ErrEncoderClosed will be returned if the Encoder was used after + // Close has been called. 
+ ErrEncoderClosed = errors.New("encoder used after Close") + + // ErrDecoderNilInput is returned when a nil Reader was provided + // and an operation other than Reset/DecodeAll/Close was attempted. + ErrDecoderNilInput = errors.New("nil input provided as reader") ) func println(a ...interface{}) { - if debug { + if debug || debugDecoder || debugEncoder { log.Println(a...) } } func printf(format string, a ...interface{}) { - if debug { + if debug || debugDecoder || debugEncoder { log.Printf(format, a...) } } -// matchLenFast does matching, but will not match the last up to 7 bytes. -func matchLenFast(a, b []byte) int { - endI := len(a) & (math.MaxInt32 - 7) - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - return i + bits.TrailingZeros64(diff)>>3 - } - } - return endI -} - -// matchLen returns the maximum length. -// a must be the shortest of the two. -// The function also returns whether all bytes matched. -func matchLen(a, b []byte) int { - b = b[:len(a)] - for i := 0; i < len(a)-7; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - return i + (bits.TrailingZeros64(diff) >> 3) - } - } - - checked := (len(a) >> 3) << 3 - a = a[checked:] - b = b[checked:] - for i := range a { - if a[i] != b[i] { - return i + checked - } - } - return len(a) + checked -} - func load3232(b []byte, i int32) uint32 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:4] - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + return binary.LittleEndian.Uint32(b[:len(b):len(b)][i:]) } func load6432(b []byte, i int32) uint64 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:8] - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 + return binary.LittleEndian.Uint64(b[:len(b):len(b)][i:]) } -func load64(b []byte, i int) uint64 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:8] - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +type byter interface { + Bytes() []byte + Len() int } + +var _ byter = &bytes.Buffer{} diff --git a/vendor/github.com/klauspost/pgzip/.travis.yml b/vendor/github.com/klauspost/pgzip/.travis.yml index acfec4bb09..34704000e9 100644 --- a/vendor/github.com/klauspost/pgzip/.travis.yml +++ b/vendor/github.com/klauspost/pgzip/.travis.yml @@ -1,3 +1,7 @@ + +arch: + - amd64 + - ppc64le language: go os: diff --git a/vendor/github.com/klauspost/pgzip/LICENSE b/vendor/github.com/klauspost/pgzip/LICENSE index 3909da4103..2bdc0d7517 100644 --- a/vendor/github.com/klauspost/pgzip/LICENSE +++ b/vendor/github.com/klauspost/pgzip/LICENSE @@ -1,4 +1,4 @@ -MIT License +The MIT License (MIT) Copyright (c) 2014 Klaus Post @@ -19,3 +19,4 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ diff --git a/vendor/github.com/klauspost/pgzip/README.md b/vendor/github.com/klauspost/pgzip/README.md index 171b978fdc..ecc8726fa7 100644 --- a/vendor/github.com/klauspost/pgzip/README.md +++ b/vendor/github.com/klauspost/pgzip/README.md @@ -104,13 +104,12 @@ Content is [Matt Mahoneys 10GB corpus](http://mattmahoney.net/dc/10gb.html). Com Compressor | MB/sec | speedup | size | size overhead (lower=better) ------------|----------|---------|------|--------- -[gzip](http://golang.org/pkg/compress/gzip) (golang) | 15.44MB/s (1 thread) | 1.0x | 4781329307 | 0% -[gzip](http://github.com/klauspost/compress/gzip) (klauspost) | 135.04MB/s (1 thread) | 8.74x | 4894858258 | +2.37% -[pgzip](https://github.com/klauspost/pgzip) (klauspost) | 1573.23MB/s| 101.9x | 4902285651 | +2.53% -[bgzf](https://godoc.org/github.com/biogo/hts/bgzf) (biogo) | 361.40MB/s | 23.4x | 4869686090 | +1.85% -[pargzip](https://godoc.org/github.com/golang/build/pargzip) (builder) | 306.01MB/s | 19.8x | 4786890417 | +0.12% +[gzip](http://golang.org/pkg/compress/gzip) (golang) | 16.91MB/s (1 thread) | 1.0x | 4781329307 | 0% +[gzip](http://github.com/klauspost/compress/gzip) (klauspost) | 127.10MB/s (1 thread) | 7.52x | 4885366806 | +2.17% +[pgzip](https://github.com/klauspost/pgzip) (klauspost) | 2085.35MB/s| 123.34x | 4886132566 | +2.19% +[pargzip](https://godoc.org/github.com/golang/build/pargzip) (builder) | 334.04MB/s | 19.76x | 4786890417 | +0.12% -pgzip also contains a [linear time compression](https://github.com/klauspost/compress#linear-time-compression-huffman-only) mode, that will allow compression at ~250MB per core per second, independent of the content. +pgzip also contains a [huffman only compression](https://github.com/klauspost/compress#linear-time-compression-huffman-only) mode, that will allow compression at ~450MB per core per second, largely independent of the content. See the [complete sheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing) for different content types and compression settings. @@ -123,7 +122,7 @@ In the example above, the numbers are as follows on a 4 CPU machine: Decompressor | Time | Speedup -------------|------|-------- [gzip](http://golang.org/pkg/compress/gzip) (golang) | 1m28.85s | 0% -[pgzip](https://github.com/klauspost/pgzip) (golang) | 43.48s | 104% +[pgzip](https://github.com/klauspost/pgzip) (klauspost) | 43.48s | 104% But wait, since gzip decompression is inherently singlethreaded (aside from CRC calculation) how can it be more than 100% faster? Because pgzip due to its design also acts as a buffer. When using unbuffered gzip, you are also waiting for io when you are decompressing. If the gzip decoder can keep up, it will always have data ready for your reader, and you will not be waiting for input to the gzip decompressor to complete. 
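The benchmark rows above come from pgzip's parallel deflate path. A small writer-side sketch, with invented file names and illustrative concurrency numbers:

```go
package main

import (
	"io"
	"log"
	"os"

	"github.com/klauspost/pgzip"
)

func main() {
	src, err := os.Open("corpus.bin") // hypothetical input
	if err != nil {
		log.Fatal(err)
	}
	defer src.Close()

	dst, err := os.Create("corpus.bin.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer dst.Close()

	w, err := pgzip.NewWriterLevel(dst, pgzip.BestSpeed)
	if err != nil {
		log.Fatal(err)
	}
	// 1 MB blocks compressed by up to 8 goroutines (illustrative numbers).
	if err := w.SetConcurrency(1<<20, 8); err != nil {
		log.Fatal(err)
	}
	if _, err := io.Copy(w, src); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil { // flushes remaining blocks and trailer
		log.Fatal(err)
	}
}
```

Further up, the new `zstd/zip.go` exposes `ZipCompressor`/`ZipDecompressor` for registration with zip libraries; a sketch against the standard library's `archive/zip`, with an invented entry name and payload:

```go
package main

import (
	"archive/zip"
	"log"
	"os"

	"github.com/klauspost/compress/zstd"
)

func main() {
	f, err := os.Create("out.zip") // hypothetical output
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	zw := zip.NewWriter(f)
	// Method 93 is the WinZip-assigned Zstandard id (ZipMethodWinZip).
	zw.RegisterCompressor(zstd.ZipMethodWinZip, zstd.ZipCompressor())

	hdr := &zip.FileHeader{Name: "data.txt", Method: zstd.ZipMethodWinZip}
	w, err := zw.CreateHeader(hdr)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := w.Write([]byte("hello, zstd-in-zip")); err != nil {
		log.Fatal(err)
	}
	if err := zw.Close(); err != nil {
		log.Fatal(err)
	}
}
```

Reading back works the same way via `(*zip.Reader).RegisterDecompressor(zstd.ZipMethodWinZip, zstd.ZipDecompressor())`.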
diff --git a/vendor/github.com/klauspost/pgzip/gunzip.go b/vendor/github.com/klauspost/pgzip/gunzip.go index d1ae730b25..3c4b32f16f 100644 --- a/vendor/github.com/klauspost/pgzip/gunzip.go +++ b/vendor/github.com/klauspost/pgzip/gunzip.go @@ -513,6 +513,19 @@ func (z *Reader) Read(p []byte) (n int, err error) { func (z *Reader) WriteTo(w io.Writer) (n int64, err error) { total := int64(0) + avail := z.current[z.roff:] + if len(avail) != 0 { + n, err := w.Write(avail) + if n != len(avail) { + return total, io.ErrShortWrite + } + total += int64(n) + if err != nil { + return total, err + } + z.blockPool <- z.current + z.current = nil + } for { if z.err != nil { return total, z.err diff --git a/vendor/github.com/mholt/archiver/v3/.gitignore b/vendor/github.com/mholt/archiver/v3/.gitignore deleted file mode 100644 index 4a87fc1aaf..0000000000 --- a/vendor/github.com/mholt/archiver/v3/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -/arc -/cmd/arc/arc -/dist/ -/vendor/ - -.DS_Store -_gitignore -builds/ -*.test -.*.sw* diff --git a/vendor/github.com/mholt/archiver/v3/.goreleaser.yml b/vendor/github.com/mholt/archiver/v3/.goreleaser.yml deleted file mode 100644 index 13cc2a679b..0000000000 --- a/vendor/github.com/mholt/archiver/v3/.goreleaser.yml +++ /dev/null @@ -1,41 +0,0 @@ -# This is an example goreleaser.yaml file with some sane defaults. -# Make sure to check the documentation at http://goreleaser.com -project_name: arc -before: - hooks: - # You may remove this if you don't use go modules. - - go mod download - # you may remove this if you don't need go generate - - go generate ./... -builds: - - - env: - - CGO_ENABLED=0 - main: ./cmd/arc - goos: - - linux - - windows - - darwin - goarch: - - 386 - - amd64 - - arm - - arm64 - goarm: - - 6 - - 7 -archives: - - - format: binary - replacements: - darwin: mac -checksum: - name_template: 'checksums.txt' -snapshot: - name_template: "{{ .Tag }}-next" -changelog: - sort: asc - filters: - exclude: - - '^docs:' - - '^test:' diff --git a/vendor/github.com/mholt/archiver/v3/.prettierrc b/vendor/github.com/mholt/archiver/v3/.prettierrc deleted file mode 100644 index f9f5139c57..0000000000 --- a/vendor/github.com/mholt/archiver/v3/.prettierrc +++ /dev/null @@ -1,4 +0,0 @@ -{ - "bracketSpacing": true, - "printWidth": 120, -} diff --git a/vendor/github.com/mholt/archiver/v3/README.md b/vendor/github.com/mholt/archiver/v3/README.md deleted file mode 100644 index c8de5e7e55..0000000000 --- a/vendor/github.com/mholt/archiver/v3/README.md +++ /dev/null @@ -1,324 +0,0 @@ -# archiver [![archiver GoDoc](https://img.shields.io/badge/reference-godoc-blue.svg?style=flat-square)](https://pkg.go.dev/github.com/mholt/archiver?tab=doc) [![Ubuntu-latest](https://github.com/mholt/archiver/actions/workflows/ubuntu-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/ubuntu-latest.yml) [![Macos-latest](https://github.com/mholt/archiver/actions/workflows/macos-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/macos-latest.yml) [![Windows-latest](https://github.com/mholt/archiver/actions/workflows/windows-latest.yml/badge.svg)](https://github.com/mholt/archiver/actions/workflows/windows-latest.yml) - -Introducing **Archiver 3.1** - a cross-platform, multi-format archive utility and Go library. 
A powerful and flexible library meets an elegant CLI in this generic replacement for several platform-specific or format-specific archive utilities.
-
-## Features
-
-Package archiver makes it trivially easy to make and extract common archive formats such as tarball (and its compressed variants) and zip. Simply name the input and output file(s). The `arc` command runs the same on all platforms and has no external dependencies (not even libc). It is powered by the Go standard library and several third-party, pure-Go libraries.
-
-Files are put into the root of the archive; directories are recursively added, preserving structure.
-
-- Make whole archives from a list of files
-- Open whole archives to a folder
-- Extract specific files/folders from archives
-- Stream files in and out of archives without needing actual files on disk
-- Traverse archive contents without loading them
-- Compress files
-- Decompress files
-- Streaming compression and decompression
-- Several archive and compression formats supported
-
-### Format-dependent features
-
-- Gzip is multithreaded
-- Optionally create a top-level folder to avoid littering a directory or archive root with files
-- Toggle overwrite existing files
-- Adjust compression level
-- Zip: store (not compress) already-compressed files
-- Make all necessary directories
-- Open password-protected RAR archives
-- Optionally continue with other files after an error
-
-### Supported compression formats
-
-- brotli (br)
-- bzip2 (bz2)
-- flate (zip)
-- gzip (gz)
-- lz4
-- snappy (sz)
-- xz
-- zstandard (zstd)
-
-### Supported archive formats
-
-- .zip
-- .tar (including any compressed variants like .tar.gz)
-- .rar (read-only)
-
-Tar files can optionally be compressed using any of the above compression formats.
-
-## GoDoc
-
-See <https://pkg.go.dev/github.com/mholt/archiver?tab=doc>
-
-## Install
-
-### With webi
-
-[`webi`](https://webinstall.dev/arc) will install `webi` and `arc` to `~/.local/bin/` and update your `PATH`.
-
-#### Mac, Linux, Raspberry Pi
-
-```bash
-curl -fsS https://webinstall.dev/arc | bash
-```
-
-#### Windows 10
-
-```pwsh
-curl.exe -fsS -A MS https://webinstall.dev/arc | powershell
-```
-
-### With Go
-
-To install the runnable binary to your \$GOPATH/bin:
-
-```bash
-go install github.com/mholt/archiver/v3/cmd/arc@latest
-```
-
-### Manually
-
-To install manually:
-
-1. Download the binary for your platform from the [Github Releases](https://github.com/mholt/archiver/releases) page.
-2. Move the binary to a location in your path, for example:
-   - without `sudo`:
-     ```bash
-     chmod a+x ~/Downloads/arc_*
-     mkdir -p ~/.local/bin
-     mv ~/Downloads/arc_* ~/.local/bin/arc
-     ```
-   - as `root`:
-     ```bash
-     chmod a+x ~/Downloads/arc_*
-     sudo mkdir -p /usr/local/bin
-     sudo mv ~/Downloads/arc_* /usr/local/bin/arc
-     ```
-3. If needed, update `~/.bashrc` or `~/.profile` to add `arc` to your `PATH`, for example:
-   ```
-   echo 'PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
-   ```
-
-## Build from Source
-
-You can successfully build `arc` with just the go tooling, or with `goreleaser`.
-
-### With `go`
-
-```bash
-go build cmd/arc/*.go
-```
-
-### Multi-platform with `goreleaser`
-
-Builds with `goreleaser` will also include version info.
-
-```bash
-goreleaser --snapshot --skip-publish --rm-dist
-```
-
-## Command Use
-
-### Make new archive
-
-```bash
-# Syntax: arc archive [archive name] [input files...]
-
-arc archive test.tar.gz file1.txt images/file2.jpg folder/subfolder
-```
-
-(At least one input file is required.)
- -### Extract entire archive - -```bash -# Syntax: arc unarchive [archive name] [destination] - -arc unarchive test.tar.gz -``` - -(The destination path is optional; default is current directory.) - -The archive name must end with a supported file extension—this is how it knows what kind of archive to make. Run `arc help` for more help. - -### List archive contents - -```bash -# Syntax: arc ls [archive name] - -arc ls caddy_dist.tar.gz -``` - -```txt -drwxr-xr-x matt staff 0 2018-09-19 15:47:18 -0600 MDT dist/ --rw-r--r-- matt staff 6148 2017-08-07 18:34:22 -0600 MDT dist/.DS_Store --rw-r--r-- matt staff 22481 2018-09-19 15:47:18 -0600 MDT dist/CHANGES.txt --rw-r--r-- matt staff 17189 2018-09-19 15:47:18 -0600 MDT dist/EULA.txt --rw-r--r-- matt staff 25261 2016-03-07 16:32:00 -0700 MST dist/LICENSES.txt --rw-r--r-- matt staff 1017 2018-09-19 15:47:18 -0600 MDT dist/README.txt --rw-r--r-- matt staff 288 2016-03-21 11:52:38 -0600 MDT dist/gitcookie.sh.enc -... -``` - -### Extract a specific file or folder from an archive - -```bash -# Syntax: arc extract [archive name] [path in archive] [destination on disk] - -arc extract test.tar.gz foo/hello.txt extracted/hello.txt -``` - -### Compress a single file - -```bash -# Syntax: arc compress [input file] [output file] - -arc compress test.txt compressed_test.txt.gz -arc compress test.txt gz -``` - -For convenience, the output file (second argument) may simply be a compression format (without leading dot), in which case the output filename will be the same as the input filename but with the format extension appended, and the input file will be deleted if successful. - -### Decompress a single file - -```bash -# Syntax: arc decompress [input file] [output file] - -arc decompress test.txt.gz original_test.txt -arc decompress test.txt.gz -``` - -For convenience, the output file (second argument) may be omitted. In that case, the output filename will have the same name as the input filename, but with the compression extension stripped from the end; and the input file will be deleted if successful. - -### Flags - -Flags are specified before the subcommand. Use `arc help` or `arc -h` to get usage help and a description of flags with their default values. - -## Library Use - -The archiver package allows you to easily create and open archives, walk their contents, extract specific files, compress and decompress files, and even stream archives in and out using pure io.Reader and io.Writer interfaces, without ever needing to touch the disk. - -To use as a dependency in your project: - -```bash -go get github.com/mholt/archiver/v3 -``` - -```go -import "github.com/mholt/archiver/v3" -``` - -[See the package's GoDoc](https://pkg.go.dev/github.com/mholt/archiver?tab=doc) for full API documentation. - -For example, creating or unpacking an archive file: - -```go -err := archiver.Archive([]string{"testdata", "other/file.txt"}, "test.zip") -// ... -err = archiver.Unarchive("test.tar.gz", "test") -``` - -The archive format is determined by file extension. (There are [several functions in this package](https://pkg.go.dev/github.com/mholt/archiver?tab=doc) which perform a task by inferring the format from file extension or file header, including `Archive()`, `Unarchive()`, `CompressFile()`, and `DecompressFile()`.) 
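A two-line illustration of the extension-driven helpers named above (the package-level functions defined in archiver.go later in this diff); paths are hypothetical and the format is inferred from the `.gz` suffix:

```go
// Sketch only; assumes the deleted archiver/v3 package and invented paths.
if err := archiver.CompressFile("report.txt", "report.txt.gz"); err != nil {
	log.Fatal(err)
}
if err := archiver.DecompressFile("report.txt.gz", "report-restored.txt"); err != nil {
	log.Fatal(err)
}
```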
-
-To configure the archiver used, or to perform more advanced operations, create an instance of the format's type:
-
-```go
-z := archiver.Zip{
-	CompressionLevel:       flate.DefaultCompression,
-	MkdirAll:               true,
-	SelectiveCompression:   true,
-	ContinueOnError:        false,
-	OverwriteExisting:      false,
-	ImplicitTopLevelFolder: false,
-}
-
-err := z.Archive([]string{"testdata", "other/file.txt"}, "/Users/matt/Desktop/test.zip")
-```
-
-Inspecting an archive:
-
-```go
-err = z.Walk("/Users/matt/Desktop/test.zip", func(f archiver.File) error {
-	zfh, ok := f.Header.(zip.FileHeader)
-	if ok {
-		fmt.Println("Filename:", zfh.Name)
-	}
-	return nil
-})
-```
-
-Streaming files into an archive that is being written to the HTTP response:
-
-```go
-err = z.Create(responseWriter)
-if err != nil {
-	return err
-}
-defer z.Close()
-
-for _, fname := range filenames {
-	info, err := os.Stat(fname)
-	if err != nil {
-		return err
-	}
-
-	// get file's name for the inside of the archive
-	internalName, err := archiver.NameInArchive(info, fname, fname)
-	if err != nil {
-		return err
-	}
-
-	// open the file
-	file, err := os.Open(fname)
-	if err != nil {
-		return err
-	}
-
-	// write it to the archive
-	err = z.Write(archiver.File{
-		FileInfo: archiver.FileInfo{
-			FileInfo:   info,
-			CustomName: internalName,
-		},
-		ReadCloser: file,
-	})
-	file.Close()
-	if err != nil {
-		return err
-	}
-}
-```
-
-The `archiver.File` type allows you to use actual files with archives, or to mimic files when you only have streams.
-
-There's a lot more that can be done, too. [See the GoDoc](https://pkg.go.dev/github.com/mholt/archiver?tab=doc) for full API documentation.
-
-**Security note: This package does NOT attempt to mitigate zip-slip attacks.** It is [extremely difficult](https://github.com/rubyzip/rubyzip/pull/376) [to do properly](https://github.com/mholt/archiver/pull/65#issuecomment-395988244) and [seemingly impossible to mitigate effectively across platforms](https://github.com/golang/go/issues/20126). [Attempted fixes have broken processing of legitimate files in production](https://github.com/mholt/archiver/pull/70#issuecomment-423267320), rendering the program unusable. Our recommendation instead is to inspect the contents of an untrusted archive before extracting it (this package provides `Walkers`) and decide if you want to proceed with extraction.
-
-## Project Values
-
-This project has a few principle-based goals that guide its development:
-
-- **Do our thing really well.** Our thing is creating, opening, inspecting, compressing, and streaming archive files. It is not meant to be a replacement for specific archive format tools like tar, zip, etc. that have lots of features and customizability. (Some customizability is OK, but not to the extent that it becomes overly complicated or error-prone.)
-
-- **Have good tests.** Changes should be covered by tests.
-
-- **Limit dependencies.** Keep the package lightweight.
-
-- **Pure Go.** This means no cgo or other external/system dependencies. This package should be able to stand on its own and cross-compile easily to any platform -- and that includes its library dependencies.
-
-- **Idiomatic Go.** Keep interfaces small, variable names semantic, vet shows no errors, the linter is generally quiet, etc.
-
-- **Be elegant.** This package should be elegant to use and its code should be elegant when reading and testing. If it doesn't feel good, fix it up.
- -- **Well-documented.** Use comments prudently; explain why non-obvious code is necessary (and use tests to enforce it). Keep the docs updated, and have examples where helpful. - -- **Keep it efficient.** This often means keep it simple. Fast code is valuable. - -- **Consensus.** Contributions should ideally be approved by multiple reviewers before being merged. Generally, avoid merging multi-chunk changes that do not go through at least one or two iterations/reviews. Except for trivial changes, PRs are seldom ready to merge right away. - -- **Have fun contributing.** Coding is awesome! - -We welcome contributions and appreciate your efforts! However, please open issues to discuss any changes before spending the time preparing a pull request. This will save time, reduce frustration, and help coordinate the work. Thank you! diff --git a/vendor/github.com/mholt/archiver/v3/SECURITY.md b/vendor/github.com/mholt/archiver/v3/SECURITY.md deleted file mode 100644 index f915712462..0000000000 --- a/vendor/github.com/mholt/archiver/v3/SECURITY.md +++ /dev/null @@ -1,15 +0,0 @@ -# Security Policy - -## Supported Versions - -| Version | Supported | -| ------- | ------------------ | -| >= 3.x | :white_check_mark: | -| < 3.0 | :x: | - -## Reporting a Vulnerability - -Please send the details to both of us: - -- AJ ONeal -- Matthew Holt diff --git a/vendor/github.com/mholt/archiver/v3/archiver.go b/vendor/github.com/mholt/archiver/v3/archiver.go deleted file mode 100644 index 6fdadadc4c..0000000000 --- a/vendor/github.com/mholt/archiver/v3/archiver.go +++ /dev/null @@ -1,540 +0,0 @@ -// Package archiver facilitates convenient, cross-platform, high-level archival -// and compression operations for a variety of formats and compression algorithms. -// -// This package and its dependencies are written in pure Go (not cgo) and -// have no external dependencies, so they should run on all major platforms. -// (It also comes with a command for CLI use in the cmd/arc folder.) -// -// Each supported format or algorithm has a unique type definition that -// implements the interfaces corresponding to the tasks they perform. For -// example, the Tar type implements Reader, Writer, Archiver, Unarchiver, -// Walker, and several other interfaces. -// -// The most common functions are implemented at the package level for -// convenience: Archive, Unarchive, Walk, Extract, CompressFile, and -// DecompressFile. With these, the format type is chosen implicitly, -// and a sane default configuration is used. -// -// To customize a format's configuration, create an instance of its struct -// with its fields set to the desired values. You can also use and customize -// the handy Default* (replace the wildcard with the format's type name) -// for a quick, one-off instance of the format's type. -// -// To obtain a new instance of a format's struct with the default config, use -// the provided New*() functions. This is not required, however. An empty -// struct of any type, for example &Zip{} is perfectly valid, so you may -// create the structs manually, too. The examples on this page show how -// either may be done. -// -// See the examples in this package for an idea of how to wield this package -// for common tasks. Most of the examples which are specific to a certain -// format type, for example Zip, can be applied to other types that implement -// the same interfaces. 
For example, using Zip is very similar to using Tar -// or TarGz (etc), and using Gz is very similar to using Sz or Xz (etc). -// -// When creating archives or compressing files using a specific instance of -// the format's type, the name of the output file MUST match that of the -// format, to prevent confusion later on. If you absolutely need a different -// file extension, you may rename the file afterward. -// -// Values in this package are NOT safe for concurrent use. There is no -// performance benefit of reusing them, and since they may contain important -// state (especially while walking, reading, or writing), it is NOT -// recommended to reuse values from this package or change their configuration -// after they are in use. -package archiver - -import ( - "fmt" - "io" - "os" - "path" - "path/filepath" - "runtime" - "strings" -) - -// Archiver is a type that can create an archive file -// from a list of source file names. -type Archiver interface { - ExtensionChecker - - // Archive adds all the files or folders in sources - // to an archive to be created at destination. Files - // are added to the root of the archive, and directories - // are walked and recursively added, preserving folder - // structure. - Archive(sources []string, destination string) error -} - -// ExtensionChecker validates file extensions -type ExtensionChecker interface { - CheckExt(name string) error -} - -// FilenameChecker validates filenames to prevent path traversal attacks -type FilenameChecker interface { - CheckPath(to, filename string) error -} - -// Unarchiver is a type that can extract archive files -// into a folder. -type Unarchiver interface { - Unarchive(source, destination string) error -} - -// Writer can write discrete byte streams of files to -// an output stream. -type Writer interface { - Create(out io.Writer) error - Write(f File) error - Close() error -} - -// Reader can read discrete byte streams of files from -// an input stream. -type Reader interface { - Open(in io.Reader, size int64) error - Read() (File, error) - Close() error -} - -// Extractor can extract a specific file from a source -// archive to a specific destination folder on disk. -type Extractor interface { - Extract(source, target, destination string) error -} - -// File provides methods for accessing information about -// or contents of a file within an archive. -type File struct { - os.FileInfo - - // The original header info; depends on - // type of archive -- could be nil, too. - Header interface{} - - // Allow the file contents to be read (and closed) - io.ReadCloser -} - -// FileInfo is an os.FileInfo but optionally with -// a custom name, useful if dealing with files that -// are not actual files on disk, or which have a -// different name in an archive than on disk. -type FileInfo struct { - os.FileInfo - CustomName string - // Stores path to the source. - // Used when reading a symlink. - SourcePath string -} - -// Name returns fi.CustomName if not empty; -// otherwise it returns fi.FileInfo.Name(). -func (fi FileInfo) Name() string { - if fi.CustomName != "" { - return fi.CustomName - } - return fi.FileInfo.Name() -} - -// ReadFakeCloser is an io.Reader that has -// a no-op close method to satisfy the -// io.ReadCloser interface. -type ReadFakeCloser struct { - io.Reader -} - -// Close implements io.Closer. -func (rfc ReadFakeCloser) Close() error { return nil } - -// Walker can walk an archive file and return information -// about each item in the archive. 
-type Walker interface { - Walk(archive string, walkFn WalkFunc) error -} - -// WalkFunc is called at each item visited by Walk. -// If an error is returned, the walk may continue -// if the Walker is configured to continue on error. -// The sole exception is the error value ErrStopWalk, -// which stops the walk without an actual error. -type WalkFunc func(f File) error - -// ErrStopWalk signals Walk to break without error. -var ErrStopWalk = fmt.Errorf("walk stopped") - -// ErrFormatNotRecognized is an error that will be -// returned if the file is not a valid archive format. -var ErrFormatNotRecognized = fmt.Errorf("format not recognized") - -// Compressor compresses to out what it reads from in. -// It also ensures a compatible or matching file extension. -type Compressor interface { - ExtensionChecker - Compress(in io.Reader, out io.Writer) error -} - -// Decompressor decompresses to out what it reads from in. -type Decompressor interface { - Decompress(in io.Reader, out io.Writer) error -} - -// Matcher is a type that can return whether the given -// file appears to match the implementation's format. -// Implementations should return the file's read position -// to where it was when the method was called. -type Matcher interface { - Match(io.ReadSeeker) (bool, error) -} - -// Archive creates an archive of the source files to a new file at destination. -// The archive format is chosen implicitly by file extension. -func Archive(sources []string, destination string) error { - aIface, err := ByExtension(destination) - if err != nil { - return err - } - a, ok := aIface.(Archiver) - if !ok { - return fmt.Errorf("format specified by destination filename is not an archive format: %s (%T)", destination, aIface) - } - return a.Archive(sources, destination) -} - -// Unarchive unarchives the given archive file into the destination folder. -// The archive format is selected implicitly. -func Unarchive(source, destination string) error { - uaIface, err := ByExtension(source) - if err != nil { - return err - } - u, ok := uaIface.(Unarchiver) - if !ok { - return fmt.Errorf("format specified by source filename is not an archive format: %s (%T)", source, uaIface) - } - return u.Unarchive(source, destination) -} - -// Walk calls walkFn for each file within the given archive file. -// The archive format is chosen implicitly. -func Walk(archive string, walkFn WalkFunc) error { - wIface, err := ByExtension(archive) - if err != nil { - return err - } - w, ok := wIface.(Walker) - if !ok { - return fmt.Errorf("format specified by archive filename is not a walker format: %s (%T)", archive, wIface) - } - return w.Walk(archive, walkFn) -} - -// Extract extracts a single file from the given source archive. If the target -// is a directory, the entire folder will be extracted into destination. The -// archive format is chosen implicitly. -func Extract(source, target, destination string) error { - eIface, err := ByExtension(source) - if err != nil { - return err - } - e, ok := eIface.(Extractor) - if !ok { - return fmt.Errorf("format specified by source filename is not an extractor format: %s (%T)", source, eIface) - } - return e.Extract(source, target, destination) -} - -// CompressFile is a convenience function to simply compress a file. -// The compression algorithm is selected implicitly based on the -// destination's extension. 
-func CompressFile(source, destination string) error { - cIface, err := ByExtension(destination) - if err != nil { - return err - } - c, ok := cIface.(Compressor) - if !ok { - return fmt.Errorf("format specified by destination filename is not a recognized compression algorithm: %s", destination) - } - return FileCompressor{Compressor: c}.CompressFile(source, destination) -} - -// DecompressFile is a convenience function to simply decompress a file. -// The decompression algorithm is selected implicitly based on the -// source's extension. -func DecompressFile(source, destination string) error { - cIface, err := ByExtension(source) - if err != nil { - return err - } - c, ok := cIface.(Decompressor) - if !ok { - return fmt.Errorf("format specified by source filename is not a recognized compression algorithm: %s", source) - } - return FileCompressor{Decompressor: c}.DecompressFile(source, destination) -} - -func fileExists(name string) bool { - _, err := os.Stat(name) - return !os.IsNotExist(err) -} - -func mkdir(dirPath string, dirMode os.FileMode) error { - err := os.MkdirAll(dirPath, dirMode) - if err != nil { - return fmt.Errorf("%s: making directory: %v", dirPath, err) - } - return nil -} - -func writeNewFile(fpath string, in io.Reader, fm os.FileMode) error { - err := os.MkdirAll(filepath.Dir(fpath), 0755) - if err != nil { - return fmt.Errorf("%s: making directory for file: %v", fpath, err) - } - - out, err := os.Create(fpath) - if err != nil { - return fmt.Errorf("%s: creating new file: %v", fpath, err) - } - defer out.Close() - - err = out.Chmod(fm) - if err != nil && runtime.GOOS != "windows" { - return fmt.Errorf("%s: changing file mode: %v", fpath, err) - } - - _, err = io.Copy(out, in) - if err != nil { - return fmt.Errorf("%s: writing file: %v", fpath, err) - } - return nil -} - -func writeNewSymbolicLink(fpath string, target string) error { - err := os.MkdirAll(filepath.Dir(fpath), 0755) - if err != nil { - return fmt.Errorf("%s: making directory for file: %v", fpath, err) - } - - _, err = os.Lstat(fpath) - if err == nil { - err = os.Remove(fpath) - if err != nil { - return fmt.Errorf("%s: failed to unlink: %+v", fpath, err) - } - } - - err = os.Symlink(target, fpath) - if err != nil { - return fmt.Errorf("%s: making symbolic link for: %v", fpath, err) - } - return nil -} - -func writeNewHardLink(fpath string, target string) error { - err := os.MkdirAll(filepath.Dir(fpath), 0755) - if err != nil { - return fmt.Errorf("%s: making directory for file: %v", fpath, err) - } - - _, err = os.Lstat(fpath) - if err == nil { - err = os.Remove(fpath) - if err != nil { - return fmt.Errorf("%s: failed to unlink: %+v", fpath, err) - } - } - - err = os.Link(target, fpath) - if err != nil { - return fmt.Errorf("%s: making hard link for: %v", fpath, err) - } - return nil -} - -func isSymlink(fi os.FileInfo) bool { - return fi.Mode()&os.ModeSymlink != 0 -} - -// within returns true if sub is within or equal to parent. -func within(parent, sub string) bool { - rel, err := filepath.Rel(parent, sub) - if err != nil { - return false - } - return !strings.Contains(rel, "..") -} - -// multipleTopLevels returns true if the paths do not -// share a common top-level folder. -func multipleTopLevels(paths []string) bool { - if len(paths) < 2 { - return false - } - var lastTop string - for _, p := range paths { - p = strings.TrimPrefix(strings.Replace(p, `\`, "/", -1), "/") - for { - next := path.Dir(p) - if next == "." 
{ - break - } - p = next - } - if lastTop == "" { - lastTop = p - } - if p != lastTop { - return true - } - } - return false -} - -// folderNameFromFileName returns a name for a folder -// that is suitable based on the filename, which will -// be stripped of its extensions. -func folderNameFromFileName(filename string) string { - base := filepath.Base(filename) - firstDot := strings.Index(base, ".") - if firstDot > -1 { - return base[:firstDot] - } - return base -} - -// makeNameInArchive returns the filename for the file given by fpath to be used within -// the archive. sourceInfo is the FileInfo obtained by calling os.Stat on source, and baseDir -// is an optional base directory that becomes the root of the archive. fpath should be the -// unaltered file path of the file given to a filepath.WalkFunc. -func makeNameInArchive(sourceInfo os.FileInfo, source, baseDir, fpath string) (string, error) { - name := filepath.Base(fpath) // start with the file or dir name - if sourceInfo.IsDir() { - // preserve internal directory structure; that's the path components - // between the source directory's leaf and this file's leaf - dir, err := filepath.Rel(filepath.Dir(source), filepath.Dir(fpath)) - if err != nil { - return "", err - } - // prepend the internal directory structure to the leaf name, - // and convert path separators to forward slashes as per spec - name = path.Join(filepath.ToSlash(dir), name) - } - return path.Join(baseDir, name), nil // prepend the base directory -} - -// NameInArchive returns a name for the file at fpath suitable for -// the inside of an archive. The source and its associated sourceInfo -// is the path where walking a directory started, and if no directory -// was walked, source may == fpath. The returned name is essentially -// the components of the path between source and fpath, preserving -// the internal directory structure. -func NameInArchive(sourceInfo os.FileInfo, source, fpath string) (string, error) { - return makeNameInArchive(sourceInfo, source, "", fpath) -} - -// ByExtension returns an archiver and unarchiver, or compressor -// and decompressor, based on the extension of the filename. -func ByExtension(filename string) (interface{}, error) { - var ec interface{} - for _, c := range extCheckers { - if err := c.CheckExt(filename); err == nil { - ec = c - break - } - } - switch ec.(type) { - case *Rar: - return NewRar(), nil - case *Tar: - return NewTar(), nil - case *TarBrotli: - return NewTarBrotli(), nil - case *TarBz2: - return NewTarBz2(), nil - case *TarGz: - return NewTarGz(), nil - case *TarLz4: - return NewTarLz4(), nil - case *TarSz: - return NewTarSz(), nil - case *TarXz: - return NewTarXz(), nil - case *TarZstd: - return NewTarZstd(), nil - case *Zip: - return NewZip(), nil - case *Gz: - return NewGz(), nil - case *Bz2: - return NewBz2(), nil - case *Lz4: - return NewLz4(), nil - case *Snappy: - return NewSnappy(), nil - case *Xz: - return NewXz(), nil - case *Zstd: - return NewZstd(), nil - } - return nil, fmt.Errorf("format unrecognized by filename: %s", filename) -} - -// ByHeader returns the unarchiver value that matches the input's -// file header. It does not affect the current read position. -// If the file's header is not a recognized archive format, then -// ErrFormatNotRecognized will be returned. 
-func ByHeader(input io.ReadSeeker) (Unarchiver, error) { - var matcher Matcher - for _, m := range matchers { - ok, err := m.Match(input) - if err != nil { - return nil, fmt.Errorf("matching on format %s: %v", m, err) - } - if ok { - matcher = m - break - } - } - switch matcher.(type) { - case *Zip: - return NewZip(), nil - case *Tar: - return NewTar(), nil - case *Rar: - return NewRar(), nil - } - return nil, ErrFormatNotRecognized -} - -// extCheckers is a list of the format implementations -// that can check extensions. Only to be used for -// checking extensions - not any archival operations. -var extCheckers = []ExtensionChecker{ - &TarBrotli{}, - &TarBz2{}, - &TarGz{}, - &TarLz4{}, - &TarSz{}, - &TarXz{}, - &TarZstd{}, - &Rar{}, - &Tar{}, - &Zip{}, - &Brotli{}, - &Gz{}, - &Bz2{}, - &Lz4{}, - &Snappy{}, - &Xz{}, - &Zstd{}, -} - -var matchers = []Matcher{ - &Rar{}, - &Tar{}, - &Zip{}, -} diff --git a/vendor/github.com/mholt/archiver/v3/brotli.go b/vendor/github.com/mholt/archiver/v3/brotli.go deleted file mode 100644 index d594d66f2d..0000000000 --- a/vendor/github.com/mholt/archiver/v3/brotli.go +++ /dev/null @@ -1,55 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "path/filepath" - - "github.com/andybalholm/brotli" -) - -// Brotli facilitates brotli compression. -type Brotli struct { - Quality int -} - -// Compress reads in, compresses it, and writes it to out. -func (br *Brotli) Compress(in io.Reader, out io.Writer) error { - w := brotli.NewWriterLevel(out, br.Quality) - defer w.Close() - _, err := io.Copy(w, in) - return err -} - -// Decompress reads in, decompresses it, and writes it to out. -func (br *Brotli) Decompress(in io.Reader, out io.Writer) error { - r := brotli.NewReader(in) - _, err := io.Copy(out, r) - return err -} - -// CheckExt ensures the file extension matches the format. -func (br *Brotli) CheckExt(filename string) error { - if filepath.Ext(filename) != ".br" { - return fmt.Errorf("filename must have a .br extension") - } - return nil -} - -func (br *Brotli) String() string { return "brotli" } - -// NewBrotli returns a new, default instance ready to be customized and used. -func NewBrotli() *Brotli { - return &Brotli{ - Quality: brotli.DefaultCompression, - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Compressor(new(Brotli)) - _ = Decompressor(new(Brotli)) -) - -// DefaultBrotli is a default instance that is conveniently ready to use. -var DefaultBrotli = NewBrotli() diff --git a/vendor/github.com/mholt/archiver/v3/build.bash b/vendor/github.com/mholt/archiver/v3/build.bash deleted file mode 100644 index 225ffc2da5..0000000000 --- a/vendor/github.com/mholt/archiver/v3/build.bash +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -set -ex - -# This script builds archiver for most common platforms. - -export CGO_ENABLED=0 - -cd cmd/arc -GOOS=linux GOARCH=amd64 go build -o ../../builds/arc_linux_amd64 -GOOS=linux GOARCH=arm go build -o ../../builds/arc_linux_arm7 -GOOS=darwin GOARCH=amd64 go build -o ../../builds/arc_mac_amd64 -GOOS=windows GOARCH=amd64 go build -o ../../builds/arc_windows_amd64.exe -cd ../.. 
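`ByHeader`, deleted above, sniffs the magic bytes instead of trusting the extension, and only recognizes rar, tar, and zip (per the `matchers` list). A hedged sketch with a hypothetical file name:

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/mholt/archiver/v3"
)

func main() {
	f, err := os.Open("mystery-archive") // extension unknown or misleading
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	u, err := archiver.ByHeader(f) // archiver.ErrFormatNotRecognized if no match
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("detected format: %T\n", u)

	if err := u.Unarchive("mystery-archive", "extracted"); err != nil {
		log.Fatal(err)
	}
}
```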
diff --git a/vendor/github.com/mholt/archiver/v3/bz2.go b/vendor/github.com/mholt/archiver/v3/bz2.go deleted file mode 100644 index 2eb4ac2b88..0000000000 --- a/vendor/github.com/mholt/archiver/v3/bz2.go +++ /dev/null @@ -1,64 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "path/filepath" - - "github.com/dsnet/compress/bzip2" -) - -// Bz2 facilitates bzip2 compression. -type Bz2 struct { - CompressionLevel int -} - -// Compress reads in, compresses it, and writes it to out. -func (bz *Bz2) Compress(in io.Reader, out io.Writer) error { - w, err := bzip2.NewWriter(out, &bzip2.WriterConfig{ - Level: bz.CompressionLevel, - }) - if err != nil { - return err - } - defer w.Close() - _, err = io.Copy(w, in) - return err -} - -// Decompress reads in, decompresses it, and writes it to out. -func (bz *Bz2) Decompress(in io.Reader, out io.Writer) error { - r, err := bzip2.NewReader(in, nil) - if err != nil { - return err - } - defer r.Close() - _, err = io.Copy(out, r) - return err -} - -// CheckExt ensures the file extension matches the format. -func (bz *Bz2) CheckExt(filename string) error { - if filepath.Ext(filename) != ".bz2" { - return fmt.Errorf("filename must have a .bz2 extension") - } - return nil -} - -func (bz *Bz2) String() string { return "bz2" } - -// NewBz2 returns a new, default instance ready to be customized and used. -func NewBz2() *Bz2 { - return &Bz2{ - CompressionLevel: bzip2.DefaultCompression, - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Compressor(new(Bz2)) - _ = Decompressor(new(Bz2)) -) - -// DefaultBz2 is a default instance that is conveniently ready to use. -var DefaultBz2 = NewBz2() diff --git a/vendor/github.com/mholt/archiver/v3/error.go b/vendor/github.com/mholt/archiver/v3/error.go deleted file mode 100644 index a46235c652..0000000000 --- a/vendor/github.com/mholt/archiver/v3/error.go +++ /dev/null @@ -1,27 +0,0 @@ -package archiver - -import ( - "fmt" - "strings" -) - -// IllegalPathError is an error returned when an illegal -// path is detected during the archival process. -// -// By default, only the Filename is showed on error, but you might -// also get the absolute value of the invalid path on the AbsolutePath -// field. -type IllegalPathError struct { - AbsolutePath string - Filename string -} - -func (err *IllegalPathError) Error() string { - return fmt.Sprintf("illegal file path: %s", err.Filename) -} - -// IsIllegalPathError returns true if the provided error is of -// the type IllegalPathError. -func IsIllegalPathError(err error) bool { - return err != nil && strings.Contains(err.Error(), "illegal file path: ") -} diff --git a/vendor/github.com/mholt/archiver/v3/filecompressor.go b/vendor/github.com/mholt/archiver/v3/filecompressor.go deleted file mode 100644 index ab1fd3b8c0..0000000000 --- a/vendor/github.com/mholt/archiver/v3/filecompressor.go +++ /dev/null @@ -1,67 +0,0 @@ -package archiver - -import ( - "fmt" - "os" -) - -// FileCompressor can compress and decompress single files. -type FileCompressor struct { - Compressor - Decompressor - - // Whether to overwrite existing files when creating files. - OverwriteExisting bool -} - -// CompressFile reads the source file and compresses it to destination. -// The destination must have a matching extension. 
-func (fc FileCompressor) CompressFile(source, destination string) error { - if err := fc.CheckExt(destination); err != nil { - return err - } - if fc.Compressor == nil { - return fmt.Errorf("no compressor specified") - } - if !fc.OverwriteExisting && fileExists(destination) { - return fmt.Errorf("file exists: %s", destination) - } - - in, err := os.Open(source) - if err != nil { - return err - } - defer in.Close() - - out, err := os.Create(destination) - if err != nil { - return err - } - defer out.Close() - - return fc.Compress(in, out) -} - -// DecompressFile reads the source file and decompresses it to destination. -func (fc FileCompressor) DecompressFile(source, destination string) error { - if fc.Decompressor == nil { - return fmt.Errorf("no decompressor specified") - } - if !fc.OverwriteExisting && fileExists(destination) { - return fmt.Errorf("file exists: %s", destination) - } - - in, err := os.Open(source) - if err != nil { - return err - } - defer in.Close() - - out, err := os.Create(destination) - if err != nil { - return err - } - defer out.Close() - - return fc.Decompress(in, out) -} diff --git a/vendor/github.com/mholt/archiver/v3/gz.go b/vendor/github.com/mholt/archiver/v3/gz.go deleted file mode 100644 index 650718d0f3..0000000000 --- a/vendor/github.com/mholt/archiver/v3/gz.go +++ /dev/null @@ -1,76 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "path/filepath" - - "github.com/klauspost/compress/gzip" - "github.com/klauspost/pgzip" -) - -// Gz facilitates gzip compression. -type Gz struct { - CompressionLevel int - SingleThreaded bool -} - -// Compress reads in, compresses it, and writes it to out. -func (gz *Gz) Compress(in io.Reader, out io.Writer) error { - var w io.WriteCloser - var err error - if gz.SingleThreaded { - w, err = gzip.NewWriterLevel(out, gz.CompressionLevel) - } else { - w, err = pgzip.NewWriterLevel(out, gz.CompressionLevel) - } - if err != nil { - return err - } - defer w.Close() - _, err = io.Copy(w, in) - return err -} - -// Decompress reads in, decompresses it, and writes it to out. -func (gz *Gz) Decompress(in io.Reader, out io.Writer) error { - var r io.ReadCloser - var err error - if gz.SingleThreaded { - r, err = gzip.NewReader(in) - } else { - r, err = pgzip.NewReader(in) - } - if err != nil { - return err - } - defer r.Close() - _, err = io.Copy(out, r) - return err -} - -// CheckExt ensures the file extension matches the format. -func (gz *Gz) CheckExt(filename string) error { - if filepath.Ext(filename) != ".gz" { - return fmt.Errorf("filename must have a .gz extension") - } - return nil -} - -func (gz *Gz) String() string { return "gz" } - -// NewGz returns a new, default instance ready to be customized and used. -func NewGz() *Gz { - return &Gz{ - CompressionLevel: gzip.DefaultCompression, - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Compressor(new(Gz)) - _ = Decompressor(new(Gz)) -) - -// DefaultGz is a default instance that is conveniently ready to use. -var DefaultGz = NewGz() diff --git a/vendor/github.com/mholt/archiver/v3/lz4.go b/vendor/github.com/mholt/archiver/v3/lz4.go deleted file mode 100644 index 3d6b0a212d..0000000000 --- a/vendor/github.com/mholt/archiver/v3/lz4.go +++ /dev/null @@ -1,63 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "path/filepath" - - "github.com/pierrec/lz4/v4" -) - -// Lz4 facilitates LZ4 compression. 
-type Lz4 struct { - CompressionLevel int -} - -// Compress reads in, compresses it, and writes it to out. -func (lz *Lz4) Compress(in io.Reader, out io.Writer) error { - w := lz4.NewWriter(out) - // TODO archiver v4: use proper lz4.Fast - // bitshifting for backwards compatibility with lz4/v3 - options := []lz4.Option{ - lz4.CompressionLevelOption(lz4.CompressionLevel(1 << (8 + lz.CompressionLevel))), - } - if err := w.Apply(options...); err != nil { - return err - } - defer w.Close() - _, err := io.Copy(w, in) - return err -} - -// Decompress reads in, decompresses it, and writes it to out. -func (lz *Lz4) Decompress(in io.Reader, out io.Writer) error { - r := lz4.NewReader(in) - _, err := io.Copy(out, r) - return err -} - -// CheckExt ensures the file extension matches the format. -func (lz *Lz4) CheckExt(filename string) error { - if filepath.Ext(filename) != ".lz4" { - return fmt.Errorf("filename must have a .lz4 extension") - } - return nil -} - -func (lz *Lz4) String() string { return "lz4" } - -// NewLz4 returns a new, default instance ready to be customized and used. -func NewLz4() *Lz4 { - return &Lz4{ - CompressionLevel: 9, // https://github.com/lz4/lz4/blob/1b819bfd633ae285df2dfe1b0589e1ec064f2873/lib/lz4hc.h#L48 - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Compressor(new(Lz4)) - _ = Decompressor(new(Lz4)) -) - -// DefaultLz4 is a default instance that is conveniently ready to use. -var DefaultLz4 = NewLz4() diff --git a/vendor/github.com/mholt/archiver/v3/rar.go b/vendor/github.com/mholt/archiver/v3/rar.go deleted file mode 100644 index 35fd60b676..0000000000 --- a/vendor/github.com/mholt/archiver/v3/rar.go +++ /dev/null @@ -1,446 +0,0 @@ -package archiver - -import ( - "bytes" - "fmt" - "io" - "log" - "os" - "path" - "path/filepath" - "strings" - "time" - - "github.com/nwaples/rardecode" -) - -// Rar provides facilities for reading RAR archives. -// See https://www.rarlab.com/technote.htm. -type Rar struct { - // Whether to overwrite existing files; if false, - // an error is returned if the file exists. - OverwriteExisting bool - - // Whether to make all the directories necessary - // to create a rar archive in the desired path. - MkdirAll bool - - // A single top-level folder can be implicitly - // created by the Unarchive method if the files - // to be extracted from the archive do not all - // have a common root. This roughly mimics the - // behavior of archival tools integrated into OS - // file browsers which create a subfolder to - // avoid unexpectedly littering the destination - // folder with potentially many files, causing a - // problematic cleanup/organization situation. - // This feature is available for both creation - // and extraction of archives, but may be slightly - // inefficient with lots and lots of files, - // especially on extraction. - ImplicitTopLevelFolder bool - - // Strip number of leading paths. This feature is available - // only during unpacking of the entire archive. - StripComponents int - - // If true, errors encountered during reading - // or writing a single file will be logged and - // the operation will continue on remaining files. - ContinueOnError bool - - // The password to open archives (optional). - Password string - - rr *rardecode.Reader // underlying stream reader - rc *rardecode.ReadCloser // supports multi-volume archives (files only) -} - -// CheckExt ensures the file extension matches the format. 
-func (*Rar) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".rar") { - return fmt.Errorf("filename must have a .rar extension") - } - return nil -} - -// CheckPath ensures that the filename has not been crafted to perform path traversal attacks -func (*Rar) CheckPath(to, filename string) error { - to, _ = filepath.Abs(to) //explicit the destination folder to prevent that 'string.HasPrefix' check can be 'bypassed' when no destination folder is supplied in input - dest := filepath.Join(to, filename) - //prevent path traversal attacks - if !strings.HasPrefix(dest, to) { - return &IllegalPathError{AbsolutePath: dest, Filename: filename} - } - return nil -} - -// Unarchive unpacks the .rar file at source to destination. -// Destination will be treated as a folder name. It supports -// multi-volume archives. -func (r *Rar) Unarchive(source, destination string) error { - if !fileExists(destination) && r.MkdirAll { - err := mkdir(destination, 0755) - if err != nil { - return fmt.Errorf("preparing destination: %v", err) - } - } - - // if the files in the archive do not all share a common - // root, then make sure we extract to a single subfolder - // rather than potentially littering the destination... - if r.ImplicitTopLevelFolder { - var err error - destination, err = r.addTopLevelFolder(source, destination) - if err != nil { - return fmt.Errorf("scanning source archive: %v", err) - } - } - - err := r.OpenFile(source) - if err != nil { - return fmt.Errorf("opening rar archive for reading: %v", err) - } - defer r.Close() - - for { - err := r.unrarNext(destination) - if err == io.EOF { - break - } - if err != nil { - if r.ContinueOnError || IsIllegalPathError(err) { - log.Printf("[ERROR] Reading file in rar archive: %v", err) - continue - } - return fmt.Errorf("reading file in rar archive: %v", err) - } - } - - return nil -} - -// addTopLevelFolder scans the files contained inside -// the tarball named sourceArchive and returns a modified -// destination if all the files do not share the same -// top-level folder. 
-func (r *Rar) addTopLevelFolder(sourceArchive, destination string) (string, error) { - file, err := os.Open(sourceArchive) - if err != nil { - return "", fmt.Errorf("opening source archive: %v", err) - } - defer file.Close() - - rc, err := rardecode.NewReader(file, r.Password) - if err != nil { - return "", fmt.Errorf("creating archive reader: %v", err) - } - - var files []string - for { - hdr, err := rc.Next() - if err == io.EOF { - break - } - if err != nil { - return "", fmt.Errorf("scanning tarball's file listing: %v", err) - } - files = append(files, hdr.Name) - } - - if multipleTopLevels(files) { - destination = filepath.Join(destination, folderNameFromFileName(sourceArchive)) - } - - return destination, nil -} - -func (r *Rar) unrarNext(to string) error { - f, err := r.Read() - if err != nil { - return err // don't wrap error; calling loop must break on io.EOF - } - defer f.Close() - - header, ok := f.Header.(*rardecode.FileHeader) - if !ok { - return fmt.Errorf("expected header to be *rardecode.FileHeader but was %T", f.Header) - } - - errPath := r.CheckPath(to, header.Name) - if errPath != nil { - return fmt.Errorf("checking path traversal attempt: %v", errPath) - } - - if r.StripComponents > 0 { - if strings.Count(header.Name, "/") < r.StripComponents { - return nil // skip path with fewer components - } - - for i := 0; i < r.StripComponents; i++ { - slash := strings.Index(header.Name, "/") - header.Name = header.Name[slash+1:] - } - } - - return r.unrarFile(f, filepath.Join(to, header.Name)) -} - -func (r *Rar) unrarFile(f File, to string) error { - // do not overwrite existing files, if configured - if !f.IsDir() && !r.OverwriteExisting && fileExists(to) { - return fmt.Errorf("file already exists: %s", to) - } - - hdr, ok := f.Header.(*rardecode.FileHeader) - if !ok { - return fmt.Errorf("expected header to be *rardecode.FileHeader but was %T", f.Header) - } - - if f.IsDir() { - if fileExists("testdata") { - err := os.Chmod(to, hdr.Mode()) - if err != nil { - return fmt.Errorf("changing dir mode: %v", err) - } - } else { - err := mkdir(to, hdr.Mode()) - if err != nil { - return fmt.Errorf("making directories: %v", err) - } - } - return nil - } - - // if files come before their containing folders, then we must - // create their folders before writing the file - err := mkdir(filepath.Dir(to), 0755) - if err != nil { - return fmt.Errorf("making parent directories: %v", err) - } - - if (hdr.Mode() & os.ModeSymlink) != 0 { - return nil - } - - return writeNewFile(to, r.rr, hdr.Mode()) -} - -// OpenFile opens filename for reading. This method supports -// multi-volume archives, whereas Open does not (but Open -// supports any stream, not just files). -func (r *Rar) OpenFile(filename string) error { - if r.rr != nil { - return fmt.Errorf("rar archive is already open for reading") - } - var err error - r.rc, err = rardecode.OpenReader(filename, r.Password) - if err != nil { - return err - } - r.rr = &r.rc.Reader - return nil -} - -// Open opens t for reading an archive from -// in. The size parameter is not used. -func (r *Rar) Open(in io.Reader, size int64) error { - if r.rr != nil { - return fmt.Errorf("rar archive is already open for reading") - } - var err error - r.rr, err = rardecode.NewReader(in, r.Password) - return err -} - -// Read reads the next file from t, which must have -// already been opened for reading. If there are no -// more files, the error is io.EOF. The File must -// be closed when finished reading from it. 
-func (r *Rar) Read() (File, error) { - if r.rr == nil { - return File{}, fmt.Errorf("rar archive is not open") - } - - hdr, err := r.rr.Next() - if err != nil { - return File{}, err // don't wrap error; preserve io.EOF - } - - file := File{ - FileInfo: rarFileInfo{hdr}, - Header: hdr, - ReadCloser: ReadFakeCloser{r.rr}, - } - - return file, nil -} - -// Close closes the rar archive(s) opened by Create and Open. -func (r *Rar) Close() error { - var err error - if r.rc != nil { - rc := r.rc - r.rc = nil - err = rc.Close() - } - if r.rr != nil { - r.rr = nil - } - return err -} - -// Walk calls walkFn for each visited item in archive. -func (r *Rar) Walk(archive string, walkFn WalkFunc) error { - file, err := os.Open(archive) - if err != nil { - return fmt.Errorf("opening archive file: %v", err) - } - defer file.Close() - - err = r.Open(file, 0) - if err != nil { - return fmt.Errorf("opening archive: %v", err) - } - defer r.Close() - - for { - f, err := r.Read() - if err == io.EOF { - break - } - if err != nil { - if r.ContinueOnError { - log.Printf("[ERROR] Opening next file: %v", err) - continue - } - return fmt.Errorf("opening next file: %v", err) - } - err = walkFn(f) - if err != nil { - if err == ErrStopWalk { - break - } - if r.ContinueOnError { - log.Printf("[ERROR] Walking %s: %v", f.Name(), err) - continue - } - return fmt.Errorf("walking %s: %v", f.Name(), err) - } - } - - return nil -} - -// Extract extracts a single file from the rar archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (r *Rar) Extract(source, target, destination string) error { - // target refers to a path inside the archive, which should be clean also - target = path.Clean(target) - - // if the target ends up being a directory, then - // we will continue walking and extracting files - // until we are no longer within that directory - var targetDirPath string - - return r.Walk(source, func(f File) error { - th, ok := f.Header.(*rardecode.FileHeader) - if !ok { - return fmt.Errorf("expected header to be *rardecode.FileHeader but was %T", f.Header) - } - - // importantly, cleaning the path strips tailing slash, - // which must be appended to folders within the archive - name := path.Clean(th.Name) - if f.IsDir() && target == name { - targetDirPath = path.Dir(name) - } - - if within(target, th.Name) { - // either this is the exact file we want, or is - // in the directory we want to extract - - // build the filename we will extract to - end, err := filepath.Rel(targetDirPath, th.Name) - if err != nil { - return fmt.Errorf("relativizing paths: %v", err) - } - joined := filepath.Join(destination, end) - - err = r.unrarFile(f, joined) - if err != nil { - return fmt.Errorf("extracting file %s: %v", th.Name, err) - } - - // if our target was not a directory, stop walk - if targetDirPath == "" { - return ErrStopWalk - } - } else if targetDirPath != "" { - // finished walking the entire directory - return ErrStopWalk - } - - return nil - }) -} - -// Match returns true if the format of file matches this -// type's format. It should not affect reader position. 
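// Usage sketch for the streaming Walk API above, against the
// github.com/mholt/archiver/v3 package this diff removes: list the first
// ten entries of an archive, then stop early with ErrStopWalk. The file
// name example.rar is hypothetical.
package main

import (
	"fmt"
	"log"

	"github.com/mholt/archiver/v3"
)

func main() {
	count := 0
	err := archiver.DefaultRar.Walk("example.rar", func(f archiver.File) error {
		fmt.Println(f.Name(), f.Size())
		count++
		if count == 10 {
			return archiver.ErrStopWalk // ends the walk without an error
		}
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}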
-func (*Rar) Match(file io.ReadSeeker) (bool, error) { - currentPos, err := file.Seek(0, io.SeekCurrent) - if err != nil { - return false, err - } - _, err = file.Seek(0, 0) - if err != nil { - return false, err - } - defer func() { - _, _ = file.Seek(currentPos, io.SeekStart) - }() - - buf := make([]byte, 8) - if n, err := file.Read(buf); err != nil || n < 8 { - return false, nil - } - hasRarHeader := bytes.Equal(buf[:7], []byte("Rar!\x1a\x07\x00")) || // ver 1.5 - bytes.Equal(buf, []byte("Rar!\x1a\x07\x01\x00")) // ver 5.0 - return hasRarHeader, nil -} - -func (r *Rar) String() string { return "rar" } - -// NewRar returns a new, default instance ready to be customized and used. -func NewRar() *Rar { - return &Rar{ - MkdirAll: true, - } -} - -type rarFileInfo struct { - fh *rardecode.FileHeader -} - -func (rfi rarFileInfo) Name() string { return rfi.fh.Name } -func (rfi rarFileInfo) Size() int64 { return rfi.fh.UnPackedSize } -func (rfi rarFileInfo) Mode() os.FileMode { return rfi.fh.Mode() } -func (rfi rarFileInfo) ModTime() time.Time { return rfi.fh.ModificationTime } -func (rfi rarFileInfo) IsDir() bool { return rfi.fh.IsDir } -func (rfi rarFileInfo) Sys() interface{} { return nil } - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Reader(new(Rar)) - _ = Unarchiver(new(Rar)) - _ = Walker(new(Rar)) - _ = Extractor(new(Rar)) - _ = Matcher(new(Rar)) - _ = ExtensionChecker(new(Rar)) - _ = FilenameChecker(new(Rar)) - _ = os.FileInfo(rarFileInfo{}) -) - -// DefaultRar is a default instance that is conveniently ready to use. -var DefaultRar = NewRar() diff --git a/vendor/github.com/mholt/archiver/v3/sz.go b/vendor/github.com/mholt/archiver/v3/sz.go deleted file mode 100644 index 02009b528f..0000000000 --- a/vendor/github.com/mholt/archiver/v3/sz.go +++ /dev/null @@ -1,51 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "path/filepath" - - "github.com/golang/snappy" -) - -// Snappy facilitates Snappy compression. -type Snappy struct{} - -// Compress reads in, compresses it, and writes it to out. -func (s *Snappy) Compress(in io.Reader, out io.Writer) error { - w := snappy.NewBufferedWriter(out) - defer w.Close() - _, err := io.Copy(w, in) - return err -} - -// Decompress reads in, decompresses it, and writes it to out. -func (s *Snappy) Decompress(in io.Reader, out io.Writer) error { - r := snappy.NewReader(in) - _, err := io.Copy(out, r) - return err -} - -// CheckExt ensures the file extension matches the format. -func (s *Snappy) CheckExt(filename string) error { - if filepath.Ext(filename) != ".sz" { - return fmt.Errorf("filename must have a .sz extension") - } - return nil -} - -func (s *Snappy) String() string { return "sz" } - -// NewSnappy returns a new, default instance ready to be customized and used. -func NewSnappy() *Snappy { - return new(Snappy) -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Compressor(new(Snappy)) - _ = Decompressor(new(Snappy)) -) - -// DefaultSnappy is a default instance that is conveniently ready to use. 
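// Stdlib-only sketch of the signature test performed by Rar.Match above:
// RAR 1.5 archives begin with the 7-byte magic "Rar!\x1a\x07\x00" and
// RAR 5.0 archives with the 8-byte magic "Rar!\x1a\x07\x01\x00".
package main

import (
	"bytes"
	"fmt"
)

func isRar(header []byte) bool {
	if len(header) < 8 {
		return false
	}
	return bytes.Equal(header[:7], []byte("Rar!\x1a\x07\x00")) || // ver 1.5
		bytes.Equal(header[:8], []byte("Rar!\x1a\x07\x01\x00")) // ver 5.0
}

func main() {
	fmt.Println(isRar([]byte("Rar!\x1a\x07\x01\x00more"))) // true (v5)
	fmt.Println(isRar([]byte("PK\x03\x04")))               // false (zip magic)
}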
-var DefaultSnappy = NewSnappy() diff --git a/vendor/github.com/mholt/archiver/v3/tar.go b/vendor/github.com/mholt/archiver/v3/tar.go deleted file mode 100644 index be8986657d..0000000000 --- a/vendor/github.com/mholt/archiver/v3/tar.go +++ /dev/null @@ -1,659 +0,0 @@ -package archiver - -import ( - "archive/tar" - "bytes" - "fmt" - "io" - "log" - "os" - "path" - "path/filepath" - "strconv" - "strings" -) - -// Tar provides facilities for operating TAR archives. -// See http://www.gnu.org/software/tar/manual/html_node/Standard.html. -type Tar struct { - // Whether to overwrite existing files; if false, - // an error is returned if the file exists. - OverwriteExisting bool - - // Whether to make all the directories necessary - // to create a tar archive in the desired path. - MkdirAll bool - - // A single top-level folder can be implicitly - // created by the Archive or Unarchive methods - // if the files to be added to the archive - // or the files to be extracted from the archive - // do not all have a common root. This roughly - // mimics the behavior of archival tools integrated - // into OS file browsers which create a subfolder - // to avoid unexpectedly littering the destination - // folder with potentially many files, causing a - // problematic cleanup/organization situation. - // This feature is available for both creation - // and extraction of archives, but may be slightly - // inefficient with lots and lots of files, - // especially on extraction. - ImplicitTopLevelFolder bool - - // Strip number of leading paths. This feature is available - // only during unpacking of the entire archive. - StripComponents int - - // If true, errors encountered during reading - // or writing a single file will be logged and - // the operation will continue on remaining files. - ContinueOnError bool - - tw *tar.Writer - tr *tar.Reader - - readerWrapFn func(io.Reader) (io.Reader, error) - writerWrapFn func(io.Writer) (io.Writer, error) - cleanupWrapFn func() -} - -// CheckExt ensures the file extension matches the format. -func (*Tar) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".tar") { - return fmt.Errorf("filename must have a .tar extension") - } - return nil -} - -// CheckPath ensures that the filename has not been crafted to perform path traversal attacks -func (*Tar) CheckPath(to, filename string) error { - to, _ = filepath.Abs(to) //explicit the destination folder to prevent that 'string.HasPrefix' check can be 'bypassed' when no destination folder is supplied in input - dest := filepath.Join(to, filename) - //prevent path traversal attacks - if !strings.HasPrefix(dest, to) { - return &IllegalPathError{AbsolutePath: dest, Filename: filename} - } - return nil -} - -// Archive creates a tarball file at destination containing -// the files listed in sources. The destination must end with -// ".tar". File paths can be those of regular files or -// directories; directories will be recursively added. 
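// Usage sketch for the Tar archiver defined above (API from the removed
// github.com/mholt/archiver/v3 package): create a tarball from two source
// paths, then unpack it elsewhere. The paths are hypothetical.
package main

import (
	"log"

	"github.com/mholt/archiver/v3"
)

func main() {
	t := archiver.NewTar()
	t.OverwriteExisting = true
	if err := t.Archive([]string{"docs", "main.go"}, "out.tar"); err != nil {
		log.Fatal(err)
	}
	// a fresh instance for reading, so writer state cannot interfere
	if err := archiver.NewTar().Unarchive("out.tar", "unpacked"); err != nil {
		log.Fatal(err)
	}
}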
-func (t *Tar) Archive(sources []string, destination string) error { - err := t.CheckExt(destination) - if t.writerWrapFn == nil && err != nil { - return fmt.Errorf("checking extension: %v", err) - } - if !t.OverwriteExisting && fileExists(destination) { - return fmt.Errorf("file already exists: %s", destination) - } - - // make the folder to contain the resulting archive - // if it does not already exist - destDir := filepath.Dir(destination) - if t.MkdirAll && !fileExists(destDir) { - err := mkdir(destDir, 0755) - if err != nil { - return fmt.Errorf("making folder for destination: %v", err) - } - } - - out, err := os.Create(destination) - if err != nil { - return fmt.Errorf("creating %s: %v", destination, err) - } - defer out.Close() - - err = t.Create(out) - if err != nil { - return fmt.Errorf("creating tar: %v", err) - } - defer t.Close() - - var topLevelFolder string - if t.ImplicitTopLevelFolder && multipleTopLevels(sources) { - topLevelFolder = folderNameFromFileName(destination) - } - - for _, source := range sources { - err := t.writeWalk(source, topLevelFolder, destination) - if err != nil { - return fmt.Errorf("walking %s: %v", source, err) - } - } - - return nil -} - -// Unarchive unpacks the .tar file at source to destination. -// Destination will be treated as a folder name. -func (t *Tar) Unarchive(source, destination string) error { - if !fileExists(destination) && t.MkdirAll { - err := mkdir(destination, 0755) - if err != nil { - return fmt.Errorf("preparing destination: %v", err) - } - } - - // if the files in the archive do not all share a common - // root, then make sure we extract to a single subfolder - // rather than potentially littering the destination... - if t.ImplicitTopLevelFolder { - var err error - destination, err = t.addTopLevelFolder(source, destination) - if err != nil { - return fmt.Errorf("scanning source archive: %v", err) - } - } - - file, err := os.Open(source) - if err != nil { - return fmt.Errorf("opening source archive: %v", err) - } - defer file.Close() - - err = t.Open(file, 0) - if err != nil { - return fmt.Errorf("opening tar archive for reading: %v", err) - } - defer t.Close() - - for { - err := t.untarNext(destination) - if err == io.EOF { - break - } - if err != nil { - if t.ContinueOnError || IsIllegalPathError(err) { - log.Printf("[ERROR] Reading file in tar archive: %v", err) - continue - } - return fmt.Errorf("reading file in tar archive: %v", err) - } - } - - return nil -} - -// addTopLevelFolder scans the files contained inside -// the tarball named sourceArchive and returns a modified -// destination if all the files do not share the same -// top-level folder. 
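// Sketch of the "implicit top-level folder" decision that addTopLevelFolder
// above builds on: when the entries do not all share one top-level
// directory, extraction gets its own subfolder. The helper name is
// hypothetical; the vendored code expresses the inverse as multipleTopLevels.
package main

import (
	"fmt"
	"strings"
)

// sharesSingleTopLevel reports whether every entry lives under the same
// first path component (false for an empty entry list).
func sharesSingleTopLevel(entries []string) bool {
	var top string
	for _, e := range entries {
		first := strings.SplitN(strings.TrimLeft(e, "/"), "/", 2)[0]
		if top == "" {
			top = first
		} else if first != top {
			return false
		}
	}
	return top != ""
}

func main() {
	fmt.Println(sharesSingleTopLevel([]string{"app/a", "app/b/c"})) // true
	fmt.Println(sharesSingleTopLevel([]string{"a.txt", "app/b"}))   // false
}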
-func (t *Tar) addTopLevelFolder(sourceArchive, destination string) (string, error) { - file, err := os.Open(sourceArchive) - if err != nil { - return "", fmt.Errorf("opening source archive: %v", err) - } - defer file.Close() - - // if the reader is to be wrapped, ensure we do that now - // or we will not be able to read the archive successfully - reader := io.Reader(file) - if t.readerWrapFn != nil { - reader, err = t.readerWrapFn(reader) - if err != nil { - return "", fmt.Errorf("wrapping reader: %v", err) - } - } - if t.cleanupWrapFn != nil { - defer t.cleanupWrapFn() - } - - tr := tar.NewReader(reader) - - var files []string - for { - hdr, err := tr.Next() - if err == io.EOF { - break - } - if err != nil { - return "", fmt.Errorf("scanning tarball's file listing: %v", err) - } - files = append(files, hdr.Name) - } - - if multipleTopLevels(files) { - destination = filepath.Join(destination, folderNameFromFileName(sourceArchive)) - } - - return destination, nil -} - -func (t *Tar) untarNext(destination string) error { - f, err := t.Read() - if err != nil { - return err // don't wrap error; calling loop must break on io.EOF - } - defer f.Close() - - header, ok := f.Header.(*tar.Header) - if !ok { - return fmt.Errorf("expected header to be *tar.Header but was %T", f.Header) - } - - errPath := t.CheckPath(destination, header.Name) - if errPath != nil { - return fmt.Errorf("checking path traversal attempt: %v", errPath) - } - - if t.StripComponents > 0 { - if strings.Count(header.Name, "/") < t.StripComponents { - return nil // skip path with fewer components - } - - for i := 0; i < t.StripComponents; i++ { - slash := strings.Index(header.Name, "/") - header.Name = header.Name[slash+1:] - } - } - return t.untarFile(f, destination, header) -} - -func (t *Tar) untarFile(f File, destination string, hdr *tar.Header) error { - to := filepath.Join(destination, hdr.Name) - - // do not overwrite existing files, if configured - if !f.IsDir() && !t.OverwriteExisting && fileExists(to) { - return fmt.Errorf("file already exists: %s", to) - } - - switch hdr.Typeflag { - case tar.TypeDir: - return mkdir(to, f.Mode()) - case tar.TypeReg, tar.TypeRegA, tar.TypeChar, tar.TypeBlock, tar.TypeFifo, tar.TypeGNUSparse: - return writeNewFile(to, f, f.Mode()) - case tar.TypeSymlink: - return writeNewSymbolicLink(to, hdr.Linkname) - case tar.TypeLink: - return writeNewHardLink(to, filepath.Join(destination, hdr.Linkname)) - case tar.TypeXGlobalHeader: - return nil // ignore the pax global header from git-generated tarballs - default: - return fmt.Errorf("%s: unknown type flag: %c", hdr.Name, hdr.Typeflag) - } -} - -func (t *Tar) writeWalk(source, topLevelFolder, destination string) error { - sourceInfo, err := os.Stat(source) - if err != nil { - return fmt.Errorf("%s: stat: %v", source, err) - } - destAbs, err := filepath.Abs(destination) - if err != nil { - return fmt.Errorf("%s: getting absolute path of destination %s: %v", source, destination, err) - } - - return filepath.Walk(source, func(fpath string, info os.FileInfo, err error) error { - handleErr := func(err error) error { - if t.ContinueOnError { - log.Printf("[ERROR] Walking %s: %v", fpath, err) - return nil - } - return err - } - if err != nil { - return handleErr(fmt.Errorf("traversing %s: %v", fpath, err)) - } - if info == nil { - return handleErr(fmt.Errorf("no file info")) - } - - // make sure we do not copy our output file into itself - fpathAbs, err := filepath.Abs(fpath) - if err != nil { - return handleErr(fmt.Errorf("%s: getting absolute path: 
%v", fpath, err)) - } - if within(fpathAbs, destAbs) { - return nil - } - - // build the name to be used within the archive - nameInArchive, err := makeNameInArchive(sourceInfo, source, topLevelFolder, fpath) - if err != nil { - return handleErr(err) - } - - var file io.ReadCloser - if info.Mode().IsRegular() { - file, err = os.Open(fpath) - if err != nil { - return handleErr(fmt.Errorf("%s: opening: %v", fpath, err)) - } - defer file.Close() - } - err = t.Write(File{ - FileInfo: FileInfo{ - FileInfo: info, - CustomName: nameInArchive, - SourcePath: fpath, - }, - ReadCloser: file, - }) - if err != nil { - return handleErr(fmt.Errorf("%s: writing: %s", fpath, err)) - } - - return nil - }) -} - -// Create opens t for writing a tar archive to out. -func (t *Tar) Create(out io.Writer) error { - if t.tw != nil { - return fmt.Errorf("tar archive is already created for writing") - } - - // wrapping writers allows us to output - // compressed tarballs, for example - if t.writerWrapFn != nil { - var err error - out, err = t.writerWrapFn(out) - if err != nil { - return fmt.Errorf("wrapping writer: %v", err) - } - } - - t.tw = tar.NewWriter(out) - return nil -} - -// Write writes f to t, which must have been opened for writing first. -func (t *Tar) Write(f File) error { - if t.tw == nil { - return fmt.Errorf("tar archive was not created for writing first") - } - if f.FileInfo == nil { - return fmt.Errorf("no file info") - } - if f.FileInfo.Name() == "" { - return fmt.Errorf("missing file name") - } - - var linkTarget string - if isSymlink(f) { - fi, ok := f.FileInfo.(FileInfo) - if !ok { - return fmt.Errorf("failed to cast fs.FileInfo to archiver.FileInfo: %v", f) - } - var err error - linkTarget, err = os.Readlink(fi.SourcePath) - if err != nil { - return fmt.Errorf("%s: readlink: %v", fi.SourcePath, err) - } - } - - hdr, err := tar.FileInfoHeader(f, filepath.ToSlash(linkTarget)) - if err != nil { - return fmt.Errorf("%s: making header: %v", f.Name(), err) - } - - err = t.tw.WriteHeader(hdr) - if err != nil { - return fmt.Errorf("%s: writing header: %w", hdr.Name, err) - } - - if f.IsDir() { - return nil // directories have no contents - } - - if hdr.Typeflag == tar.TypeReg { - if f.ReadCloser == nil { - return fmt.Errorf("%s: no way to read file contents", f.Name()) - } - _, err := io.Copy(t.tw, f) - if err != nil { - return fmt.Errorf("%s: copying contents: %w", f.Name(), err) - } - } - - return nil -} - -// Open opens t for reading an archive from -// in. The size parameter is not used. -func (t *Tar) Open(in io.Reader, size int64) error { - if t.tr != nil { - return fmt.Errorf("tar archive is already open for reading") - } - // wrapping readers allows us to open compressed tarballs - if t.readerWrapFn != nil { - var err error - in, err = t.readerWrapFn(in) - if err != nil { - return fmt.Errorf("wrapping file reader: %v", err) - } - } - t.tr = tar.NewReader(in) - return nil -} - -// Read reads the next file from t, which must have -// already been opened for reading. If there are no -// more files, the error is io.EOF. The File must -// be closed when finished reading from it. -func (t *Tar) Read() (File, error) { - if t.tr == nil { - return File{}, fmt.Errorf("tar archive is not open") - } - - hdr, err := t.tr.Next() - if err != nil { - return File{}, err // don't wrap error; preserve io.EOF - } - - file := File{ - FileInfo: hdr.FileInfo(), - Header: hdr, - ReadCloser: ReadFakeCloser{t.tr}, - } - - return file, nil -} - -// Close closes the tar archive(s) opened by Create and Open. 
-func (t *Tar) Close() error { - var err error - if t.tr != nil { - t.tr = nil - } - if t.tw != nil { - tw := t.tw - t.tw = nil - err = tw.Close() - } - // make sure cleanup of "Reader/Writer wrapper" - // (say that ten times fast) happens AFTER the - // underlying stream is closed - if t.cleanupWrapFn != nil { - t.cleanupWrapFn() - } - return err -} - -// Walk calls walkFn for each visited item in archive. -func (t *Tar) Walk(archive string, walkFn WalkFunc) error { - file, err := os.Open(archive) - if err != nil { - return fmt.Errorf("opening archive file: %v", err) - } - defer file.Close() - - err = t.Open(file, 0) - if err != nil { - return fmt.Errorf("opening archive: %v", err) - } - defer t.Close() - - for { - f, err := t.Read() - if err == io.EOF { - break - } - if err != nil { - if t.ContinueOnError { - log.Printf("[ERROR] Opening next file: %v", err) - continue - } - return fmt.Errorf("opening next file: %v", err) - } - err = walkFn(f) - if err != nil { - if err == ErrStopWalk { - break - } - if t.ContinueOnError { - log.Printf("[ERROR] Walking %s: %v", f.Name(), err) - continue - } - return fmt.Errorf("walking %s: %v", f.Name(), err) - } - } - - return nil -} - -// Extract extracts a single file from the tar archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (t *Tar) Extract(source, target, destination string) error { - // target refers to a path inside the archive, which should be clean also - target = path.Clean(target) - - // if the target ends up being a directory, then - // we will continue walking and extracting files - // until we are no longer within that directory - var targetDirPath string - - return t.Walk(source, func(f File) error { - th, ok := f.Header.(*tar.Header) - if !ok { - return fmt.Errorf("expected header to be *tar.Header but was %T", f.Header) - } - - // importantly, cleaning the path strips tailing slash, - // which must be appended to folders within the archive - name := path.Clean(th.Name) - if f.IsDir() && target == name { - targetDirPath = path.Dir(name) - } - - if within(target, th.Name) { - // either this is the exact file we want, or is - // in the directory we want to extract - - // build the filename we will extract to - end, err := filepath.Rel(targetDirPath, th.Name) - if err != nil { - return fmt.Errorf("relativizing paths: %v", err) - } - th.Name = end - - // relativize any hardlink names - if th.Typeflag == tar.TypeLink { - th.Linkname = filepath.Join(filepath.Base(filepath.Dir(th.Linkname)), filepath.Base(th.Linkname)) - } - - err = t.untarFile(f, destination, th) - if err != nil { - return fmt.Errorf("extracting file %s: %v", th.Name, err) - } - - // if our target was not a directory, stop walk - if targetDirPath == "" { - return ErrStopWalk - } - } else if targetDirPath != "" { - // finished walking the entire directory - return ErrStopWalk - } - - return nil - }) -} - -// Match returns true if the format of file matches this -// type's format. It should not affect reader position. -func (*Tar) Match(file io.ReadSeeker) (bool, error) { - currentPos, err := file.Seek(0, io.SeekCurrent) - if err != nil { - return false, err - } - _, err = file.Seek(0, 0) - if err != nil { - return false, err - } - defer func() { - _, _ = file.Seek(currentPos, io.SeekStart) - }() - - buf := make([]byte, tarBlockSize) - if _, err = io.ReadFull(file, buf); err != nil { - return false, nil - } - return hasTarHeader(buf), nil -} - -// hasTarHeader checks passed bytes has a valid tar header or not. 
buf must -// contain at least 512 bytes and if not, it always returns false. -func hasTarHeader(buf []byte) bool { - if len(buf) < tarBlockSize { - return false - } - - b := buf[148:156] - b = bytes.Trim(b, " \x00") // clean up all spaces and null bytes - if len(b) == 0 { - return false // unknown format - } - hdrSum, err := strconv.ParseUint(string(b), 8, 64) - if err != nil { - return false - } - - // According to the go official archive/tar, Sun tar uses signed byte - // values so this calcs both signed and unsigned - var usum uint64 - var sum int64 - for i, c := range buf { - if 148 <= i && i < 156 { - c = ' ' // checksum field itself is counted as branks - } - usum += uint64(uint8(c)) - sum += int64(int8(c)) - } - - if hdrSum != usum && int64(hdrSum) != sum { - return false // invalid checksum - } - - return true -} - -func (t *Tar) String() string { return "tar" } - -// NewTar returns a new, default instance ready to be customized and used. -func NewTar() *Tar { - return &Tar{ - MkdirAll: true, - } -} - -const tarBlockSize = 512 - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Reader(new(Tar)) - _ = Writer(new(Tar)) - _ = Archiver(new(Tar)) - _ = Unarchiver(new(Tar)) - _ = Walker(new(Tar)) - _ = Extractor(new(Tar)) - _ = Matcher(new(Tar)) - _ = ExtensionChecker(new(Tar)) - _ = FilenameChecker(new(Tar)) -) - -// DefaultTar is a default instance that is conveniently ready to use. -var DefaultTar = NewTar() diff --git a/vendor/github.com/mholt/archiver/v3/tarbrotli.go b/vendor/github.com/mholt/archiver/v3/tarbrotli.go deleted file mode 100644 index 83a455d66a..0000000000 --- a/vendor/github.com/mholt/archiver/v3/tarbrotli.go +++ /dev/null @@ -1,114 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "strings" - - "github.com/andybalholm/brotli" -) - -// TarBrotli facilitates brotli compression of tarball archives. -type TarBrotli struct { - *Tar - Quality int -} - -// CheckExt ensures the file extension matches the format. -func (*TarBrotli) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".tar.br") && - !strings.HasSuffix(filename, ".tbr") { - return fmt.Errorf("filename must have a .tar.br or .tbr extension") - } - return nil -} - -// Archive creates a compressed tar file at destination -// containing the files listed in sources. The destination -// must end with ".tar.br" or ".tbr". File paths can be -// those of regular files or directories; directories will -// be recursively added. -func (tbr *TarBrotli) Archive(sources []string, destination string) error { - err := tbr.CheckExt(destination) - if err != nil { - return fmt.Errorf("output %s", err.Error()) - } - tbr.wrapWriter() - return tbr.Tar.Archive(sources, destination) -} - -// Unarchive unpacks the compressed tarball at -// source to destination. Destination will be -// treated as a folder name. -func (tbr *TarBrotli) Unarchive(source, destination string) error { - tbr.wrapReader() - return tbr.Tar.Unarchive(source, destination) -} - -// Walk calls walkFn for each visited item in archive. -func (tbr *TarBrotli) Walk(archive string, walkFn WalkFunc) error { - tbr.wrapReader() - return tbr.Tar.Walk(archive, walkFn) -} - -// Create opens txz for writing a compressed -// tar archive to out. -func (tbr *TarBrotli) Create(out io.Writer) error { - tbr.wrapWriter() - return tbr.Tar.Create(out) -} - -// Open opens t for reading a compressed archive from -// in. The size parameter is not used. 
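// Usage sketch for the brotli-wrapped tar archiver above: the same Tar
// pipeline, with the reader/writer wrap functions supplying brotli
// compression. The paths are hypothetical.
package main

import (
	"log"

	"github.com/mholt/archiver/v3"
)

func main() {
	tbr := archiver.NewTarBrotli()
	tbr.Quality = 6 // brotli quality; the default is brotli.DefaultCompression
	if err := tbr.Archive([]string{"docs"}, "docs.tar.br"); err != nil {
		log.Fatal(err)
	}
	if err := archiver.NewTarBrotli().Unarchive("docs.tar.br", "restored"); err != nil {
		log.Fatal(err)
	}
}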
-func (tbr *TarBrotli) Open(in io.Reader, size int64) error { - tbr.wrapReader() - return tbr.Tar.Open(in, size) -} - -// Extract extracts a single file from the tar archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (tbr *TarBrotli) Extract(source, target, destination string) error { - tbr.wrapReader() - return tbr.Tar.Extract(source, target, destination) -} - -func (tbr *TarBrotli) wrapWriter() { - var brw *brotli.Writer - tbr.Tar.writerWrapFn = func(w io.Writer) (io.Writer, error) { - brw = brotli.NewWriterLevel(w, tbr.Quality) - return brw, nil - } - tbr.Tar.cleanupWrapFn = func() { - brw.Close() - } -} - -func (tbr *TarBrotli) wrapReader() { - tbr.Tar.readerWrapFn = func(r io.Reader) (io.Reader, error) { - return brotli.NewReader(r), nil - } -} - -func (tbr *TarBrotli) String() string { return "tar.br" } - -// NewTarBrotli returns a new, default instance ready to be customized and used. -func NewTarBrotli() *TarBrotli { - return &TarBrotli{ - Tar: NewTar(), - Quality: brotli.DefaultCompression, - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Reader(new(TarBrotli)) - _ = Writer(new(TarBrotli)) - _ = Archiver(new(TarBrotli)) - _ = Unarchiver(new(TarBrotli)) - _ = Walker(new(TarBrotli)) - _ = Extractor(new(TarBrotli)) -) - -// DefaultTarBrotli is a convenient archiver ready to use. -var DefaultTarBrotli = NewTarBrotli() diff --git a/vendor/github.com/mholt/archiver/v3/tarbz2.go b/vendor/github.com/mholt/archiver/v3/tarbz2.go deleted file mode 100644 index e5870a7d29..0000000000 --- a/vendor/github.com/mholt/archiver/v3/tarbz2.go +++ /dev/null @@ -1,126 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "strings" - - "github.com/dsnet/compress/bzip2" -) - -// TarBz2 facilitates bzip2 compression -// (https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf) -// of tarball archives. -type TarBz2 struct { - *Tar - - CompressionLevel int -} - -// CheckExt ensures the file extension matches the format. -func (*TarBz2) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".tar.bz2") && - !strings.HasSuffix(filename, ".tbz2") { - return fmt.Errorf("filename must have a .tar.bz2 or .tbz2 extension") - } - return nil -} - -// Archive creates a compressed tar file at destination -// containing the files listed in sources. The destination -// must end with ".tar.bz2" or ".tbz2". File paths can be -// those of regular files or directories; directories will -// be recursively added. -func (tbz2 *TarBz2) Archive(sources []string, destination string) error { - err := tbz2.CheckExt(destination) - if err != nil { - return fmt.Errorf("output %s", err.Error()) - } - tbz2.wrapWriter() - return tbz2.Tar.Archive(sources, destination) -} - -// Unarchive unpacks the compressed tarball at -// source to destination. Destination will be -// treated as a folder name. -func (tbz2 *TarBz2) Unarchive(source, destination string) error { - tbz2.wrapReader() - return tbz2.Tar.Unarchive(source, destination) -} - -// Walk calls walkFn for each visited item in archive. -func (tbz2 *TarBz2) Walk(archive string, walkFn WalkFunc) error { - tbz2.wrapReader() - return tbz2.Tar.Walk(archive, walkFn) -} - -// Create opens tbz2 for writing a compressed -// tar archive to out. -func (tbz2 *TarBz2) Create(out io.Writer) error { - tbz2.wrapWriter() - return tbz2.Tar.Create(out) -} - -// Open opens t for reading a compressed archive from -// in. 
The size parameter is not used. -func (tbz2 *TarBz2) Open(in io.Reader, size int64) error { - tbz2.wrapReader() - return tbz2.Tar.Open(in, size) -} - -// Extract extracts a single file from the tar archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (tbz2 *TarBz2) Extract(source, target, destination string) error { - tbz2.wrapReader() - return tbz2.Tar.Extract(source, target, destination) -} - -func (tbz2 *TarBz2) wrapWriter() { - var bz2w *bzip2.Writer - tbz2.Tar.writerWrapFn = func(w io.Writer) (io.Writer, error) { - var err error - bz2w, err = bzip2.NewWriter(w, &bzip2.WriterConfig{ - Level: tbz2.CompressionLevel, - }) - return bz2w, err - } - tbz2.Tar.cleanupWrapFn = func() { - bz2w.Close() - } -} - -func (tbz2 *TarBz2) wrapReader() { - var bz2r *bzip2.Reader - tbz2.Tar.readerWrapFn = func(r io.Reader) (io.Reader, error) { - var err error - bz2r, err = bzip2.NewReader(r, nil) - return bz2r, err - } - tbz2.Tar.cleanupWrapFn = func() { - bz2r.Close() - } -} - -func (tbz2 *TarBz2) String() string { return "tar.bz2" } - -// NewTarBz2 returns a new, default instance ready to be customized and used. -func NewTarBz2() *TarBz2 { - return &TarBz2{ - CompressionLevel: bzip2.DefaultCompression, - Tar: NewTar(), - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Reader(new(TarBz2)) - _ = Writer(new(TarBz2)) - _ = Archiver(new(TarBz2)) - _ = Unarchiver(new(TarBz2)) - _ = Walker(new(TarBz2)) - _ = Extractor(new(TarBz2)) -) - -// DefaultTarBz2 is a convenient archiver ready to use. -var DefaultTarBz2 = NewTarBz2() diff --git a/vendor/github.com/mholt/archiver/v3/targz.go b/vendor/github.com/mholt/archiver/v3/targz.go deleted file mode 100644 index 283fd01b2b..0000000000 --- a/vendor/github.com/mholt/archiver/v3/targz.go +++ /dev/null @@ -1,137 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "strings" - - "github.com/klauspost/compress/gzip" - "github.com/klauspost/pgzip" -) - -// TarGz facilitates gzip compression -// (RFC 1952) of tarball archives. -type TarGz struct { - *Tar - - // The compression level to use, as described - // in the compress/gzip package. - CompressionLevel int - - // Disables parallel gzip. - SingleThreaded bool -} - -// CheckExt ensures the file extension matches the format. -func (*TarGz) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".tar.gz") && - !strings.HasSuffix(filename, ".tgz") { - return fmt.Errorf("filename must have a .tar.gz or .tgz extension") - } - return nil -} - -// Archive creates a compressed tar file at destination -// containing the files listed in sources. The destination -// must end with ".tar.gz" or ".tgz". File paths can be -// those of regular files or directories; directories will -// be recursively added. -func (tgz *TarGz) Archive(sources []string, destination string) error { - err := tgz.CheckExt(destination) - if err != nil { - return fmt.Errorf("output %s", err.Error()) - } - tgz.wrapWriter() - return tgz.Tar.Archive(sources, destination) -} - -// Unarchive unpacks the compressed tarball at -// source to destination. Destination will be -// treated as a folder name. -func (tgz *TarGz) Unarchive(source, destination string) error { - tgz.wrapReader() - return tgz.Tar.Unarchive(source, destination) -} - -// Walk calls walkFn for each visited item in archive. 
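// Usage sketch for TarGz above: by default compression runs through pgzip
// (parallel gzip); setting SingleThreaded switches to klauspost's gzip.
// The path is hypothetical.
package main

import (
	"log"

	"github.com/mholt/archiver/v3"
)

func main() {
	tgz := archiver.NewTarGz()
	tgz.CompressionLevel = 9 // as in compress/gzip: BestCompression
	tgz.SingleThreaded = true
	if err := tgz.Archive([]string{"docs"}, "docs.tar.gz"); err != nil {
		log.Fatal(err)
	}
}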
-func (tgz *TarGz) Walk(archive string, walkFn WalkFunc) error { - tgz.wrapReader() - return tgz.Tar.Walk(archive, walkFn) -} - -// Create opens txz for writing a compressed -// tar archive to out. -func (tgz *TarGz) Create(out io.Writer) error { - tgz.wrapWriter() - return tgz.Tar.Create(out) -} - -// Open opens t for reading a compressed archive from -// in. The size parameter is not used. -func (tgz *TarGz) Open(in io.Reader, size int64) error { - tgz.wrapReader() - return tgz.Tar.Open(in, size) -} - -// Extract extracts a single file from the tar archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (tgz *TarGz) Extract(source, target, destination string) error { - tgz.wrapReader() - return tgz.Tar.Extract(source, target, destination) -} - -func (tgz *TarGz) wrapWriter() { - var gzw io.WriteCloser - tgz.Tar.writerWrapFn = func(w io.Writer) (io.Writer, error) { - var err error - if tgz.SingleThreaded { - gzw, err = gzip.NewWriterLevel(w, tgz.CompressionLevel) - } else { - gzw, err = pgzip.NewWriterLevel(w, tgz.CompressionLevel) - } - return gzw, err - } - tgz.Tar.cleanupWrapFn = func() { - gzw.Close() - } -} - -func (tgz *TarGz) wrapReader() { - var gzr io.ReadCloser - tgz.Tar.readerWrapFn = func(r io.Reader) (io.Reader, error) { - var err error - if tgz.SingleThreaded { - gzr, err = gzip.NewReader(r) - } else { - gzr, err = pgzip.NewReader(r) - } - return gzr, err - } - tgz.Tar.cleanupWrapFn = func() { - gzr.Close() - } -} - -func (tgz *TarGz) String() string { return "tar.gz" } - -// NewTarGz returns a new, default instance ready to be customized and used. -func NewTarGz() *TarGz { - return &TarGz{ - CompressionLevel: gzip.DefaultCompression, - Tar: NewTar(), - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Reader(new(TarGz)) - _ = Writer(new(TarGz)) - _ = Archiver(new(TarGz)) - _ = Unarchiver(new(TarGz)) - _ = Walker(new(TarGz)) - _ = Extractor(new(TarGz)) -) - -// DefaultTarGz is a convenient archiver ready to use. -var DefaultTarGz = NewTarGz() diff --git a/vendor/github.com/mholt/archiver/v3/tarlz4.go b/vendor/github.com/mholt/archiver/v3/tarlz4.go deleted file mode 100644 index 42cbc90bbb..0000000000 --- a/vendor/github.com/mholt/archiver/v3/tarlz4.go +++ /dev/null @@ -1,129 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "strings" - - "github.com/pierrec/lz4/v4" -) - -// TarLz4 facilitates lz4 compression -// (https://github.com/lz4/lz4/tree/master/doc) -// of tarball archives. -type TarLz4 struct { - *Tar - - // The compression level to use when writing. - // Minimum 0 (fast compression), maximum 12 - // (most space savings). - CompressionLevel int -} - -// CheckExt ensures the file extension matches the format. -func (*TarLz4) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".tar.lz4") && - !strings.HasSuffix(filename, ".tlz4") { - - return fmt.Errorf("filename must have a .tar.lz4 or .tlz4 extension") - } - return nil -} - -// Archive creates a compressed tar file at destination -// containing the files listed in sources. The destination -// must end with ".tar.lz4" or ".tlz4". File paths can be -// those of regular files or directories; directories will -// be recursively added. 
-func (tlz4 *TarLz4) Archive(sources []string, destination string) error { - err := tlz4.CheckExt(destination) - if err != nil { - return fmt.Errorf("output %s", err.Error()) - } - tlz4.wrapWriter() - return tlz4.Tar.Archive(sources, destination) -} - -// Unarchive unpacks the compressed tarball at -// source to destination. Destination will be -// treated as a folder name. -func (tlz4 *TarLz4) Unarchive(source, destination string) error { - tlz4.wrapReader() - return tlz4.Tar.Unarchive(source, destination) -} - -// Walk calls walkFn for each visited item in archive. -func (tlz4 *TarLz4) Walk(archive string, walkFn WalkFunc) error { - tlz4.wrapReader() - return tlz4.Tar.Walk(archive, walkFn) -} - -// Create opens tlz4 for writing a compressed -// tar archive to out. -func (tlz4 *TarLz4) Create(out io.Writer) error { - tlz4.wrapWriter() - return tlz4.Tar.Create(out) -} - -// Open opens t for reading a compressed archive from -// in. The size parameter is not used. -func (tlz4 *TarLz4) Open(in io.Reader, size int64) error { - tlz4.wrapReader() - return tlz4.Tar.Open(in, size) -} - -// Extract extracts a single file from the tar archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (tlz4 *TarLz4) Extract(source, target, destination string) error { - tlz4.wrapReader() - return tlz4.Tar.Extract(source, target, destination) -} - -func (tlz4 *TarLz4) wrapWriter() { - var lz4w *lz4.Writer - tlz4.Tar.writerWrapFn = func(w io.Writer) (io.Writer, error) { - lz4w = lz4.NewWriter(w) - // TODO archiver v4: use proper lz4.Fast - // bitshifting for backwards compatibility with lz4/v3 - options := []lz4.Option{ - lz4.CompressionLevelOption(lz4.CompressionLevel(1 << (8 + tlz4.CompressionLevel))), - } - if err := lz4w.Apply(options...); err != nil { - return lz4w, err - } - return lz4w, nil - } - tlz4.Tar.cleanupWrapFn = func() { - lz4w.Close() - } -} - -func (tlz4 *TarLz4) wrapReader() { - tlz4.Tar.readerWrapFn = func(r io.Reader) (io.Reader, error) { - return lz4.NewReader(r), nil - } -} - -func (tlz4 *TarLz4) String() string { return "tar.lz4" } - -// NewTarLz4 returns a new, default instance ready to be customized and used. -func NewTarLz4() *TarLz4 { - return &TarLz4{ - CompressionLevel: 9, // https://github.com/lz4/lz4/blob/1b819bfd633ae285df2dfe1b0589e1ec064f2873/lib/lz4hc.h#L48 - Tar: NewTar(), - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Reader(new(TarLz4)) - _ = Writer(new(TarLz4)) - _ = Archiver(new(TarLz4)) - _ = Unarchiver(new(TarLz4)) - _ = Walker(new(TarLz4)) - _ = Extractor(new(TarLz4)) -) - -// DefaultTarLz4 is a convenient archiver ready to use. -var DefaultTarLz4 = NewTarLz4() diff --git a/vendor/github.com/mholt/archiver/v3/tarsz.go b/vendor/github.com/mholt/archiver/v3/tarsz.go deleted file mode 100644 index ee3808e63d..0000000000 --- a/vendor/github.com/mholt/archiver/v3/tarsz.go +++ /dev/null @@ -1,114 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "strings" - - "github.com/golang/snappy" -) - -// TarSz facilitates Snappy compression -// (https://github.com/google/snappy) -// of tarball archives. -type TarSz struct { - *Tar -} - -// CheckExt ensures the file extension matches the format. 
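// Sketch of the level mapping used by TarLz4.wrapWriter above: the v3 API
// keeps the old 0-12 scale and bit-shifts it into lz4/v4's CompressionLevel
// constants, which (if I read the v4 package correctly) define Level1..Level9
// as 1<<9 .. 1<<17, so v3 level 9 lands on lz4.Level9.
package main

import (
	"fmt"

	"github.com/pierrec/lz4/v4"
)

func main() {
	mapped := lz4.CompressionLevel(1 << (8 + 9)) // the shift from wrapWriter
	fmt.Println(mapped == lz4.Level9)            // true
}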
-func (*TarSz) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".tar.sz") && - !strings.HasSuffix(filename, ".tsz") { - return fmt.Errorf("filename must have a .tar.sz or .tsz extension") - } - return nil -} - -// Archive creates a compressed tar file at destination -// containing the files listed in sources. The destination -// must end with ".tar.sz" or ".tsz". File paths can be -// those of regular files or directories; directories will -// be recursively added. -func (tsz *TarSz) Archive(sources []string, destination string) error { - err := tsz.CheckExt(destination) - if err != nil { - return fmt.Errorf("output %s", err.Error()) - } - tsz.wrapWriter() - return tsz.Tar.Archive(sources, destination) -} - -// Unarchive unpacks the compressed tarball at -// source to destination. Destination will be -// treated as a folder name. -func (tsz *TarSz) Unarchive(source, destination string) error { - tsz.wrapReader() - return tsz.Tar.Unarchive(source, destination) -} - -// Walk calls walkFn for each visited item in archive. -func (tsz *TarSz) Walk(archive string, walkFn WalkFunc) error { - tsz.wrapReader() - return tsz.Tar.Walk(archive, walkFn) -} - -// Create opens tsz for writing a compressed -// tar archive to out. -func (tsz *TarSz) Create(out io.Writer) error { - tsz.wrapWriter() - return tsz.Tar.Create(out) -} - -// Open opens t for reading a compressed archive from -// in. The size parameter is not used. -func (tsz *TarSz) Open(in io.Reader, size int64) error { - tsz.wrapReader() - return tsz.Tar.Open(in, size) -} - -// Extract extracts a single file from the tar archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (tsz *TarSz) Extract(source, target, destination string) error { - tsz.wrapReader() - return tsz.Tar.Extract(source, target, destination) -} - -func (tsz *TarSz) wrapWriter() { - var sw *snappy.Writer - tsz.Tar.writerWrapFn = func(w io.Writer) (io.Writer, error) { - sw = snappy.NewBufferedWriter(w) - return sw, nil - } - tsz.Tar.cleanupWrapFn = func() { - sw.Close() - } -} - -func (tsz *TarSz) wrapReader() { - tsz.Tar.readerWrapFn = func(r io.Reader) (io.Reader, error) { - return snappy.NewReader(r), nil - } -} - -func (tsz *TarSz) String() string { return "tar.sz" } - -// NewTarSz returns a new, default instance ready to be customized and used. -func NewTarSz() *TarSz { - return &TarSz{ - Tar: NewTar(), - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Reader(new(TarSz)) - _ = Writer(new(TarSz)) - _ = Archiver(new(TarSz)) - _ = Unarchiver(new(TarSz)) - _ = Walker(new(TarSz)) - _ = Extractor(new(TarSz)) -) - -// DefaultTarSz is a convenient archiver ready to use. -var DefaultTarSz = NewTarSz() diff --git a/vendor/github.com/mholt/archiver/v3/tarxz.go b/vendor/github.com/mholt/archiver/v3/tarxz.go deleted file mode 100644 index 5679a067be..0000000000 --- a/vendor/github.com/mholt/archiver/v3/tarxz.go +++ /dev/null @@ -1,119 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "strings" - - "github.com/ulikunitz/xz" - fastxz "github.com/xi2/xz" -) - -// TarXz facilitates xz compression -// (https://tukaani.org/xz/format.html) -// of tarball archives. -type TarXz struct { - *Tar -} - -// CheckExt ensures the file extension matches the format. 
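// Roundtrip sketch of the snappy stream wrapping used by TarSz above, with
// the same github.com/golang/snappy calls the deleted code relies on.
package main

import (
	"bytes"
	"fmt"
	"io"
	"log"

	"github.com/golang/snappy"
)

func main() {
	var buf bytes.Buffer
	w := snappy.NewBufferedWriter(&buf)
	if _, err := io.WriteString(w, "hello snappy"); err != nil {
		log.Fatal(err)
	}
	w.Close() // flush the stream framing

	out, err := io.ReadAll(snappy.NewReader(&buf))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(out)) // "hello snappy"
}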
-func (*TarXz) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".tar.xz") && - !strings.HasSuffix(filename, ".txz") { - return fmt.Errorf("filename must have a .tar.xz or .txz extension") - } - return nil -} - -// Archive creates a compressed tar file at destination -// containing the files listed in sources. The destination -// must end with ".tar.xz" or ".txz". File paths can be -// those of regular files or directories; directories will -// be recursively added. -func (txz *TarXz) Archive(sources []string, destination string) error { - err := txz.CheckExt(destination) - if err != nil { - return fmt.Errorf("output %s", err.Error()) - } - txz.wrapWriter() - return txz.Tar.Archive(sources, destination) -} - -// Unarchive unpacks the compressed tarball at -// source to destination. Destination will be -// treated as a folder name. -func (txz *TarXz) Unarchive(source, destination string) error { - txz.wrapReader() - return txz.Tar.Unarchive(source, destination) -} - -// Walk calls walkFn for each visited item in archive. -func (txz *TarXz) Walk(archive string, walkFn WalkFunc) error { - txz.wrapReader() - return txz.Tar.Walk(archive, walkFn) -} - -// Create opens txz for writing a compressed -// tar archive to out. -func (txz *TarXz) Create(out io.Writer) error { - txz.wrapWriter() - return txz.Tar.Create(out) -} - -// Open opens t for reading a compressed archive from -// in. The size parameter is not used. -func (txz *TarXz) Open(in io.Reader, size int64) error { - txz.wrapReader() - return txz.Tar.Open(in, size) -} - -// Extract extracts a single file from the tar archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (txz *TarXz) Extract(source, target, destination string) error { - txz.wrapReader() - return txz.Tar.Extract(source, target, destination) -} - -func (txz *TarXz) wrapWriter() { - var xzw *xz.Writer - txz.Tar.writerWrapFn = func(w io.Writer) (io.Writer, error) { - var err error - xzw, err = xz.NewWriter(w) - return xzw, err - } - txz.Tar.cleanupWrapFn = func() { - xzw.Close() - } -} - -func (txz *TarXz) wrapReader() { - var xzr *fastxz.Reader - txz.Tar.readerWrapFn = func(r io.Reader) (io.Reader, error) { - var err error - xzr, err = fastxz.NewReader(r, 0) - return xzr, err - } -} - -func (txz *TarXz) String() string { return "tar.xz" } - -// NewTarXz returns a new, default instance ready to be customized and used. -func NewTarXz() *TarXz { - return &TarXz{ - Tar: NewTar(), - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Reader(new(TarXz)) - _ = Writer(new(TarXz)) - _ = Archiver(new(TarXz)) - _ = Unarchiver(new(TarXz)) - _ = Walker(new(TarXz)) - _ = Extractor(new(TarXz)) -) - -// DefaultTarXz is a convenient archiver ready to use. -var DefaultTarXz = NewTarXz() diff --git a/vendor/github.com/mholt/archiver/v3/tarzst.go b/vendor/github.com/mholt/archiver/v3/tarzst.go deleted file mode 100644 index 3b2fe43189..0000000000 --- a/vendor/github.com/mholt/archiver/v3/tarzst.go +++ /dev/null @@ -1,120 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "strings" - - "github.com/klauspost/compress/zstd" -) - -// TarZstd facilitates Zstandard compression -// (RFC 8478) of tarball archives. -type TarZstd struct { - *Tar -} - -// CheckExt ensures the file extension matches the format. 
-func (*TarZstd) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".tar.zst") { - return fmt.Errorf("filename must have a .tar.zst extension") - } - return nil -} - -// Archive creates a compressed tar file at destination -// containing the files listed in sources. The destination -// must end with ".tar.zst" or ".tzst". File paths can be -// those of regular files or directories; directories will -// be recursively added. -func (tzst *TarZstd) Archive(sources []string, destination string) error { - err := tzst.CheckExt(destination) - if err != nil { - return fmt.Errorf("output %s", err.Error()) - } - tzst.wrapWriter() - return tzst.Tar.Archive(sources, destination) -} - -// Unarchive unpacks the compressed tarball at -// source to destination. Destination will be -// treated as a folder name. -func (tzst *TarZstd) Unarchive(source, destination string) error { - tzst.wrapReader() - return tzst.Tar.Unarchive(source, destination) -} - -// Walk calls walkFn for each visited item in archive. -func (tzst *TarZstd) Walk(archive string, walkFn WalkFunc) error { - tzst.wrapReader() - return tzst.Tar.Walk(archive, walkFn) -} - -// Create opens txz for writing a compressed -// tar archive to out. -func (tzst *TarZstd) Create(out io.Writer) error { - tzst.wrapWriter() - return tzst.Tar.Create(out) -} - -// Open opens t for reading a compressed archive from -// in. The size parameter is not used. -func (tzst *TarZstd) Open(in io.Reader, size int64) error { - tzst.wrapReader() - return tzst.Tar.Open(in, size) -} - -// Extract extracts a single file from the tar archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (tzst *TarZstd) Extract(source, target, destination string) error { - tzst.wrapReader() - return tzst.Tar.Extract(source, target, destination) -} - -func (tzst *TarZstd) wrapWriter() { - var zstdw *zstd.Encoder - tzst.Tar.writerWrapFn = func(w io.Writer) (io.Writer, error) { - var err error - zstdw, err = zstd.NewWriter(w) - return zstdw, err - } - tzst.Tar.cleanupWrapFn = func() { - zstdw.Close() - } -} - -func (tzst *TarZstd) wrapReader() { - var zstdr *zstd.Decoder - tzst.Tar.readerWrapFn = func(r io.Reader) (io.Reader, error) { - var err error - zstdr, err = zstd.NewReader(r) - return zstdr, err - } - tzst.Tar.cleanupWrapFn = func() { - zstdr.Close() - } -} - -func (tzst *TarZstd) String() string { return "tar.zst" } - -// NewTarZstd returns a new, default instance ready to be customized and used. -func NewTarZstd() *TarZstd { - return &TarZstd{ - Tar: NewTar(), - } -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Reader(new(TarZstd)) - _ = Writer(new(TarZstd)) - _ = Archiver(new(TarZstd)) - _ = Unarchiver(new(TarZstd)) - _ = Walker(new(TarZstd)) - _ = ExtensionChecker(new(TarZstd)) - _ = Extractor(new(TarZstd)) -) - -// DefaultTarZstd is a convenient archiver ready to use. -var DefaultTarZstd = NewTarZstd() diff --git a/vendor/github.com/mholt/archiver/v3/xz.go b/vendor/github.com/mholt/archiver/v3/xz.go deleted file mode 100644 index c60d5eaec6..0000000000 --- a/vendor/github.com/mholt/archiver/v3/xz.go +++ /dev/null @@ -1,58 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "path/filepath" - - "github.com/ulikunitz/xz" - fastxz "github.com/xi2/xz" -) - -// Xz facilitates XZ compression. -type Xz struct{} - -// Compress reads in, compresses it, and writes it to out. 
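// Roundtrip sketch of the zstd wrapping used by TarZstd above, with the same
// github.com/klauspost/compress/zstd calls the deleted code uses.
package main

import (
	"bytes"
	"fmt"
	"io"
	"log"

	"github.com/klauspost/compress/zstd"
)

func main() {
	var buf bytes.Buffer
	enc, err := zstd.NewWriter(&buf)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := io.WriteString(enc, "hello zstd"); err != nil {
		log.Fatal(err)
	}
	enc.Close() // finish the frame

	dec, err := zstd.NewReader(&buf)
	if err != nil {
		log.Fatal(err)
	}
	defer dec.Close()
	out, err := io.ReadAll(dec)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(out)) // "hello zstd"
}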
-func (x *Xz) Compress(in io.Reader, out io.Writer) error { - w, err := xz.NewWriter(out) - if err != nil { - return err - } - defer w.Close() - _, err = io.Copy(w, in) - return err -} - -// Decompress reads in, decompresses it, and writes it to out. -func (x *Xz) Decompress(in io.Reader, out io.Writer) error { - r, err := fastxz.NewReader(in, 0) - if err != nil { - return err - } - _, err = io.Copy(out, r) - return err -} - -// CheckExt ensures the file extension matches the format. -func (x *Xz) CheckExt(filename string) error { - if filepath.Ext(filename) != ".xz" { - return fmt.Errorf("filename must have a .xz extension") - } - return nil -} - -func (x *Xz) String() string { return "xz" } - -// NewXz returns a new, default instance ready to be customized and used. -func NewXz() *Xz { - return new(Xz) -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Compressor(new(Xz)) - _ = Decompressor(new(Xz)) -) - -// DefaultXz is a default instance that is conveniently ready to use. -var DefaultXz = NewXz() diff --git a/vendor/github.com/mholt/archiver/v3/zip.go b/vendor/github.com/mholt/archiver/v3/zip.go deleted file mode 100644 index c6af8efbc6..0000000000 --- a/vendor/github.com/mholt/archiver/v3/zip.go +++ /dev/null @@ -1,711 +0,0 @@ -package archiver - -import ( - "bytes" - "compress/flate" - "fmt" - "io" - "io/ioutil" - "log" - "os" - "path" - "path/filepath" - "strings" - - "github.com/dsnet/compress/bzip2" - "github.com/klauspost/compress/zip" - "github.com/klauspost/compress/zstd" - "github.com/ulikunitz/xz" -) - -// ZipCompressionMethod Compression type -type ZipCompressionMethod uint16 - -// Compression methods. -// see https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT. -// Note LZMA: Disabled - because 7z isn't able to unpack ZIP+LZMA ZIP+LZMA2 archives made this way - and vice versa. -const ( - Store ZipCompressionMethod = 0 - Deflate ZipCompressionMethod = 8 - BZIP2 ZipCompressionMethod = 12 - LZMA ZipCompressionMethod = 14 - ZSTD ZipCompressionMethod = 93 - XZ ZipCompressionMethod = 95 -) - -// Zip provides facilities for operating ZIP archives. -// See https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT. -type Zip struct { - // The compression level to use, as described - // in the compress/flate package. - CompressionLevel int - - // Whether to overwrite existing files; if false, - // an error is returned if the file exists. - OverwriteExisting bool - - // Whether to make all the directories necessary - // to create a zip archive in the desired path. - MkdirAll bool - - // If enabled, selective compression will only - // compress files which are not already in a - // compressed format; this is decided based - // simply on file extension. - SelectiveCompression bool - - // A single top-level folder can be implicitly - // created by the Archive or Unarchive methods - // if the files to be added to the archive - // or the files to be extracted from the archive - // do not all have a common root. This roughly - // mimics the behavior of archival tools integrated - // into OS file browsers which create a subfolder - // to avoid unexpectedly littering the destination - // folder with potentially many files, causing a - // problematic cleanup/organization situation. - // This feature is available for both creation - // and extraction of archives, but may be slightly - // inefficient with lots and lots of files, - // especially on extraction. 
- ImplicitTopLevelFolder bool - - // Strip number of leading paths. This feature is available - // only during unpacking of the entire archive. - StripComponents int - - // If true, errors encountered during reading - // or writing a single file will be logged and - // the operation will continue on remaining files. - ContinueOnError bool - - // Compression algorithm - FileMethod ZipCompressionMethod - zw *zip.Writer - zr *zip.Reader - ridx int - //decinitialized bool -} - -// CheckExt ensures the file extension matches the format. -func (*Zip) CheckExt(filename string) error { - if !strings.HasSuffix(filename, ".zip") { - return fmt.Errorf("filename must have a .zip extension") - } - return nil -} - -// Registering a global decompressor is not reentrant and may panic -func registerDecompressor(zr *zip.Reader) { - // register zstd decompressor - zr.RegisterDecompressor(uint16(ZSTD), func(r io.Reader) io.ReadCloser { - zr, err := zstd.NewReader(r) - if err != nil { - return nil - } - return zr.IOReadCloser() - }) - zr.RegisterDecompressor(uint16(BZIP2), func(r io.Reader) io.ReadCloser { - bz2r, err := bzip2.NewReader(r, nil) - if err != nil { - return nil - } - return bz2r - }) - zr.RegisterDecompressor(uint16(XZ), func(r io.Reader) io.ReadCloser { - xr, err := xz.NewReader(r) - if err != nil { - return nil - } - return ioutil.NopCloser(xr) - }) -} - -// CheckPath ensures the file extension matches the format. -func (*Zip) CheckPath(to, filename string) error { - to, _ = filepath.Abs(to) //explicit the destination folder to prevent that 'string.HasPrefix' check can be 'bypassed' when no destination folder is supplied in input - dest := filepath.Join(to, filename) - //prevent path traversal attacks - if !strings.HasPrefix(dest, to) { - return &IllegalPathError{AbsolutePath: dest, Filename: filename} - } - return nil -} - -// Archive creates a .zip file at destination containing -// the files listed in sources. The destination must end -// with ".zip". File paths can be those of regular files -// or directories. Regular files are stored at the 'root' -// of the archive, and directories are recursively added. -func (z *Zip) Archive(sources []string, destination string) error { - err := z.CheckExt(destination) - if err != nil { - return fmt.Errorf("checking extension: %v", err) - } - if !z.OverwriteExisting && fileExists(destination) { - return fmt.Errorf("file already exists: %s", destination) - } - - // make the folder to contain the resulting archive - // if it does not already exist - destDir := filepath.Dir(destination) - if z.MkdirAll && !fileExists(destDir) { - err := mkdir(destDir, 0755) - if err != nil { - return fmt.Errorf("making folder for destination: %v", err) - } - } - - out, err := os.Create(destination) - if err != nil { - return fmt.Errorf("creating %s: %v", destination, err) - } - defer out.Close() - - err = z.Create(out) - if err != nil { - return fmt.Errorf("creating zip: %v", err) - } - defer z.Close() - - var topLevelFolder string - if z.ImplicitTopLevelFolder && multipleTopLevels(sources) { - topLevelFolder = folderNameFromFileName(destination) - } - - for _, source := range sources { - err := z.writeWalk(source, topLevelFolder, destination) - if err != nil { - return fmt.Errorf("walking %s: %v", source, err) - } - } - - return nil -} - -// Unarchive unpacks the .zip file at source to destination. -// Destination will be treated as a folder name. 
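// Standalone sketch of the per-reader registration done by
// registerDecompressor above: teach klauspost's zip reader to open entries
// stored with method 93 (zstd). The file name example.zip is hypothetical.
package main

import (
	"fmt"
	"io"
	"log"
	"os"

	"github.com/klauspost/compress/zip"
	"github.com/klauspost/compress/zstd"
)

func main() {
	f, err := os.Open("example.zip")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	fi, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}

	zr, err := zip.NewReader(f, fi.Size())
	if err != nil {
		log.Fatal(err)
	}
	// method 93 = zstd; without this, zstd entries fail to open
	zr.RegisterDecompressor(93, func(r io.Reader) io.ReadCloser {
		d, err := zstd.NewReader(r)
		if err != nil {
			return nil
		}
		return d.IOReadCloser()
	})
	for _, zf := range zr.File {
		fmt.Println(zf.Name)
	}
}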
-func (z *Zip) Unarchive(source, destination string) error { - if !fileExists(destination) && z.MkdirAll { - err := mkdir(destination, 0755) - if err != nil { - return fmt.Errorf("preparing destination: %v", err) - } - } - - file, err := os.Open(source) - if err != nil { - return fmt.Errorf("opening source file: %v", err) - } - defer file.Close() - - fileInfo, err := file.Stat() - if err != nil { - return fmt.Errorf("statting source file: %v", err) - } - - err = z.Open(file, fileInfo.Size()) - if err != nil { - return fmt.Errorf("opening zip archive for reading: %v", err) - } - defer z.Close() - - // if the files in the archive do not all share a common - // root, then make sure we extract to a single subfolder - // rather than potentially littering the destination... - if z.ImplicitTopLevelFolder { - files := make([]string, len(z.zr.File)) - for i := range z.zr.File { - files[i] = z.zr.File[i].Name - } - if multipleTopLevels(files) { - destination = filepath.Join(destination, folderNameFromFileName(source)) - } - } - - for { - err := z.extractNext(destination) - if err == io.EOF { - break - } - if err != nil { - if z.ContinueOnError || IsIllegalPathError(err) { - log.Printf("[ERROR] Reading file in zip archive: %v", err) - continue - } - return fmt.Errorf("reading file in zip archive: %v", err) - } - } - - return nil -} - -func (z *Zip) extractNext(to string) error { - f, err := z.Read() - if err != nil { - return err // don't wrap error; calling loop must break on io.EOF - } - defer f.Close() - - header, ok := f.Header.(zip.FileHeader) - if !ok { - return fmt.Errorf("expected header to be zip.FileHeader but was %T", f.Header) - } - - errPath := z.CheckPath(to, header.Name) - if errPath != nil { - return fmt.Errorf("checking path traversal attempt: %v", errPath) - } - - if z.StripComponents > 0 { - if strings.Count(header.Name, "/") < z.StripComponents { - return nil // skip path with fewer components - } - - for i := 0; i < z.StripComponents; i++ { - slash := strings.Index(header.Name, "/") - header.Name = header.Name[slash+1:] - } - } - return z.extractFile(f, to, &header) -} - -func (z *Zip) extractFile(f File, to string, header *zip.FileHeader) error { - to = filepath.Join(to, header.Name) - - // if a directory, no content; simply make the directory and return - if f.IsDir() { - return mkdir(to, f.Mode()) - } - - // do not overwrite existing files, if configured - if !z.OverwriteExisting && fileExists(to) { - return fmt.Errorf("file already exists: %s", to) - } - - // extract symbolic links as symbolic links - if isSymlink(header.FileInfo()) { - // symlink target is the contents of the file - buf := new(bytes.Buffer) - _, err := io.Copy(buf, f) - if err != nil { - return fmt.Errorf("%s: reading symlink target: %v", header.Name, err) - } - return writeNewSymbolicLink(to, strings.TrimSpace(buf.String())) - } - - return writeNewFile(to, f, f.Mode()) -} - -func (z *Zip) writeWalk(source, topLevelFolder, destination string) error { - sourceInfo, err := os.Stat(source) - if err != nil { - return fmt.Errorf("%s: stat: %v", source, err) - } - destAbs, err := filepath.Abs(destination) - if err != nil { - return fmt.Errorf("%s: getting absolute path of destination %s: %v", source, destination, err) - } - - return filepath.Walk(source, func(fpath string, info os.FileInfo, err error) error { - handleErr := func(err error) error { - if z.ContinueOnError { - log.Printf("[ERROR] Walking %s: %v", fpath, err) - return nil - } - return err - } - if err != nil { - return handleErr(fmt.Errorf("traversing 
%s: %v", fpath, err)) - } - if info == nil { - return handleErr(fmt.Errorf("%s: no file info", fpath)) - } - - // make sure we do not copy the output file into the output - // file; that results in an infinite loop and disk exhaustion! - fpathAbs, err := filepath.Abs(fpath) - if err != nil { - return handleErr(fmt.Errorf("%s: getting absolute path: %v", fpath, err)) - } - if within(fpathAbs, destAbs) { - return nil - } - - // build the name to be used within the archive - nameInArchive, err := makeNameInArchive(sourceInfo, source, topLevelFolder, fpath) - if err != nil { - return handleErr(err) - } - - var file io.ReadCloser - if info.Mode().IsRegular() { - file, err = os.Open(fpath) - if err != nil { - return handleErr(fmt.Errorf("%s: opening: %v", fpath, err)) - } - defer file.Close() - } - err = z.Write(File{ - FileInfo: FileInfo{ - FileInfo: info, - CustomName: nameInArchive, - SourcePath: fpath, - }, - ReadCloser: file, - }) - if err != nil { - return handleErr(fmt.Errorf("%s: writing: %s", fpath, err)) - } - - return nil - }) -} - -// Create opens z for writing a ZIP archive to out. -func (z *Zip) Create(out io.Writer) error { - if z.zw != nil { - return fmt.Errorf("zip archive is already created for writing") - } - z.zw = zip.NewWriter(out) - if z.CompressionLevel != flate.DefaultCompression { - z.zw.RegisterCompressor(zip.Deflate, func(out io.Writer) (io.WriteCloser, error) { - return flate.NewWriter(out, z.CompressionLevel) - }) - } - switch z.FileMethod { - case BZIP2: - z.zw.RegisterCompressor(uint16(BZIP2), func(out io.Writer) (io.WriteCloser, error) { - return bzip2.NewWriter(out, &bzip2.WriterConfig{Level: z.CompressionLevel}) - }) - case ZSTD: - z.zw.RegisterCompressor(uint16(ZSTD), func(out io.Writer) (io.WriteCloser, error) { - return zstd.NewWriter(out) - }) - case XZ: - z.zw.RegisterCompressor(uint16(XZ), func(out io.Writer) (io.WriteCloser, error) { - return xz.NewWriter(out) - }) - } - return nil -} - -// Write writes f to z, which must have been opened for writing first. -func (z *Zip) Write(f File) error { - if z.zw == nil { - return fmt.Errorf("zip archive was not created for writing first") - } - if f.FileInfo == nil { - return fmt.Errorf("no file info") - } - if f.FileInfo.Name() == "" { - return fmt.Errorf("missing file name") - } - - header, err := zip.FileInfoHeader(f) - if err != nil { - return fmt.Errorf("%s: getting header: %v", f.Name(), err) - } - - if f.IsDir() { - header.Name += "/" // required - strangely no mention of this in zip spec? but is in godoc... 
- header.Method = zip.Store - } else { - ext := strings.ToLower(path.Ext(header.Name)) - if _, ok := compressedFormats[ext]; ok && z.SelectiveCompression { - header.Method = zip.Store - } else { - header.Method = uint16(z.FileMethod) - } - } - - writer, err := z.zw.CreateHeader(header) - if err != nil { - return fmt.Errorf("%s: making header: %w", f.Name(), err) - } - - return z.writeFile(f, writer) -} - -func (z *Zip) writeFile(f File, writer io.Writer) error { - if f.IsDir() { - return nil // directories have no contents - } - if isSymlink(f) { - fi, ok := f.FileInfo.(FileInfo) - if !ok { - return fmt.Errorf("failed to cast fs.FileInfo to archiver.FileInfo: %v", f) - } - // file body for symlinks is the symlink target - linkTarget, err := os.Readlink(fi.SourcePath) - if err != nil { - return fmt.Errorf("%s: readlink: %v", fi.SourcePath, err) - } - _, err = writer.Write([]byte(filepath.ToSlash(linkTarget))) - if err != nil { - return fmt.Errorf("%s: writing symlink target: %v", fi.SourcePath, err) - } - return nil - } - - if f.ReadCloser == nil { - return fmt.Errorf("%s: no way to read file contents", f.Name()) - } - _, err := io.Copy(writer, f) - if err != nil { - return fmt.Errorf("%s: copying contents: %w", f.Name(), err) - } - - return nil -} - -// Open opens z for reading an archive from in, -// which is expected to have the given size and -// which must be an io.ReaderAt. -func (z *Zip) Open(in io.Reader, size int64) error { - inRdrAt, ok := in.(io.ReaderAt) - if !ok { - return fmt.Errorf("reader must be io.ReaderAt") - } - if z.zr != nil { - return fmt.Errorf("zip archive is already open for reading") - } - var err error - z.zr, err = zip.NewReader(inRdrAt, size) - if err != nil { - return fmt.Errorf("creating reader: %v", err) - } - registerDecompressor(z.zr) - z.ridx = 0 - return nil -} - -// Read reads the next file from z, which must have -// already been opened for reading. If there are no -// more files, the error is io.EOF. The File must -// be closed when finished reading from it. -func (z *Zip) Read() (File, error) { - if z.zr == nil { - return File{}, fmt.Errorf("zip archive is not open") - } - if z.ridx >= len(z.zr.File) { - return File{}, io.EOF - } - - // access the file and increment counter so that - // if there is an error processing this file, the - // caller can still iterate to the next file - zf := z.zr.File[z.ridx] - z.ridx++ - - file := File{ - FileInfo: zf.FileInfo(), - Header: zf.FileHeader, - } - - rc, err := zf.Open() - if err != nil { - return file, fmt.Errorf("%s: open compressed file: %v", zf.Name, err) - } - file.ReadCloser = rc - - return file, nil -} - -// Close closes the zip archive(s) opened by Create and Open. -func (z *Zip) Close() error { - if z.zr != nil { - z.zr = nil - } - if z.zw != nil { - zw := z.zw - z.zw = nil - return zw.Close() - } - return nil -} - -// Walk calls walkFn for each visited item in archive. 
-func (z *Zip) Walk(archive string, walkFn WalkFunc) error { - zr, err := zip.OpenReader(archive) - if err != nil { - return fmt.Errorf("opening zip reader: %v", err) - } - defer zr.Close() - registerDecompressor(&zr.Reader) - for _, zf := range zr.File { - zfrc, err := zf.Open() - if err != nil { - if zfrc != nil { - zfrc.Close() - } - if z.ContinueOnError { - log.Printf("[ERROR] Opening %s: %v", zf.Name, err) - continue - } - return fmt.Errorf("opening %s: %v", zf.Name, err) - } - - err = walkFn(File{ - FileInfo: zf.FileInfo(), - Header: zf.FileHeader, - ReadCloser: zfrc, - }) - zfrc.Close() - if err != nil { - if err == ErrStopWalk { - break - } - if z.ContinueOnError { - log.Printf("[ERROR] Walking %s: %v", zf.Name, err) - continue - } - return fmt.Errorf("walking %s: %v", zf.Name, err) - } - } - - return nil -} - -// Extract extracts a single file from the zip archive. -// If the target is a directory, the entire folder will -// be extracted into destination. -func (z *Zip) Extract(source, target, destination string) error { - // target refers to a path inside the archive, which should be clean also - target = path.Clean(target) - - // if the target ends up being a directory, then - // we will continue walking and extracting files - // until we are no longer within that directory - var targetDirPath string - - return z.Walk(source, func(f File) error { - zfh, ok := f.Header.(zip.FileHeader) - if !ok { - return fmt.Errorf("expected header to be zip.FileHeader but was %T", f.Header) - } - - // importantly, cleaning the path strips tailing slash, - // which must be appended to folders within the archive - name := path.Clean(zfh.Name) - if f.IsDir() && target == name { - targetDirPath = path.Dir(name) - } - - if within(target, zfh.Name) { - // either this is the exact file we want, or is - // in the directory we want to extract - - // build the filename we will extract to - end, err := filepath.Rel(targetDirPath, zfh.Name) - if err != nil { - return fmt.Errorf("relativizing paths: %v", err) - } - joined := filepath.Join(destination, end) - - err = z.extractFile(f, joined, &zfh) - if err != nil { - return fmt.Errorf("extracting file %s: %v", zfh.Name, err) - } - - // if our target was not a directory, stop walk - if targetDirPath == "" { - return ErrStopWalk - } - } else if targetDirPath != "" { - // finished walking the entire directory - return ErrStopWalk - } - - return nil - }) -} - -// Match returns true if the format of file matches this -// type's format. It should not affect reader position. -func (*Zip) Match(file io.ReadSeeker) (bool, error) { - currentPos, err := file.Seek(0, io.SeekCurrent) - if err != nil { - return false, err - } - _, err = file.Seek(0, 0) - if err != nil { - return false, err - } - defer func() { - _, _ = file.Seek(currentPos, io.SeekStart) - }() - - buf := make([]byte, 4) - if n, err := file.Read(buf); err != nil || n < 4 { - return false, nil - } - return bytes.Equal(buf, []byte("PK\x03\x04")), nil -} - -func (z *Zip) String() string { return "zip" } - -// NewZip returns a new, default instance ready to be customized and used. -func NewZip() *Zip { - return &Zip{ - CompressionLevel: flate.DefaultCompression, - MkdirAll: true, - SelectiveCompression: true, - FileMethod: Deflate, - } -} - -// Compile-time checks to ensure type implements desired interfaces. 
-var ( - _ = Reader(new(Zip)) - _ = Writer(new(Zip)) - _ = Archiver(new(Zip)) - _ = Unarchiver(new(Zip)) - _ = Walker(new(Zip)) - _ = Extractor(new(Zip)) - _ = Matcher(new(Zip)) - _ = ExtensionChecker(new(Zip)) - _ = FilenameChecker(new(Zip)) -) - -// compressedFormats is a (non-exhaustive) set of lowercased -// file extensions for formats that are typically already -// compressed. Compressing files that are already compressed -// is inefficient, so use this set of extension to avoid that. -var compressedFormats = map[string]struct{}{ - ".7z": {}, - ".avi": {}, - ".br": {}, - ".bz2": {}, - ".cab": {}, - ".docx": {}, - ".gif": {}, - ".gz": {}, - ".jar": {}, - ".jpeg": {}, - ".jpg": {}, - ".lz": {}, - ".lz4": {}, - ".lzma": {}, - ".m4v": {}, - ".mov": {}, - ".mp3": {}, - ".mp4": {}, - ".mpeg": {}, - ".mpg": {}, - ".png": {}, - ".pptx": {}, - ".rar": {}, - ".sz": {}, - ".tbz2": {}, - ".tgz": {}, - ".tsz": {}, - ".txz": {}, - ".xlsx": {}, - ".xz": {}, - ".zip": {}, - ".zipx": {}, -} - -// DefaultZip is a default instance that is conveniently ready to use. -var DefaultZip = NewZip() diff --git a/vendor/github.com/mholt/archiver/v3/zstd.go b/vendor/github.com/mholt/archiver/v3/zstd.go deleted file mode 100644 index 60c11efc49..0000000000 --- a/vendor/github.com/mholt/archiver/v3/zstd.go +++ /dev/null @@ -1,61 +0,0 @@ -package archiver - -import ( - "fmt" - "io" - "path/filepath" - - "github.com/klauspost/compress/zstd" -) - -// Zstd facilitates Zstandard compression. -type Zstd struct { - EncoderOptions []zstd.EOption - DecoderOptions []zstd.DOption -} - -// Compress reads in, compresses it, and writes it to out. -func (zs *Zstd) Compress(in io.Reader, out io.Writer) error { - w, err := zstd.NewWriter(out, zs.EncoderOptions...) - if err != nil { - return err - } - defer w.Close() - _, err = io.Copy(w, in) - return err -} - -// Decompress reads in, decompresses it, and writes it to out. -func (zs *Zstd) Decompress(in io.Reader, out io.Writer) error { - r, err := zstd.NewReader(in, zs.DecoderOptions...) - if err != nil { - return err - } - defer r.Close() - _, err = io.Copy(out, r) - return err -} - -// CheckExt ensures the file extension matches the format. -func (zs *Zstd) CheckExt(filename string) error { - if filepath.Ext(filename) != ".zst" { - return fmt.Errorf("filename must have a .zst extension") - } - return nil -} - -func (zs *Zstd) String() string { return "zstd" } - -// NewZstd returns a new, default instance ready to be customized and used. -func NewZstd() *Zstd { - return new(Zstd) -} - -// Compile-time checks to ensure type implements desired interfaces. -var ( - _ = Compressor(new(Zstd)) - _ = Decompressor(new(Zstd)) -) - -// DefaultZstd is a default instance that is conveniently ready to use. 
-var DefaultZstd = NewZstd() diff --git a/vendor/github.com/mholt/archives/.gitignore b/vendor/github.com/mholt/archives/.gitignore new file mode 100644 index 0000000000..d9ac416228 --- /dev/null +++ b/vendor/github.com/mholt/archives/.gitignore @@ -0,0 +1 @@ +_gitignore \ No newline at end of file diff --git a/vendor/github.com/mholt/archives/7z.go b/vendor/github.com/mholt/archives/7z.go new file mode 100644 index 0000000000..2e502035a0 --- /dev/null +++ b/vendor/github.com/mholt/archives/7z.go @@ -0,0 +1,125 @@ +package archives + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "io/fs" + "log" + "strings" + + "github.com/bodgit/sevenzip" +) + +func init() { + RegisterFormat(SevenZip{}) + + // looks like the sevenzip package registers a lot of decompressors for us automatically: + // https://github.com/bodgit/sevenzip/blob/46c5197162c784318b98b9a3f80289a9aa1ca51a/register.go#L38-L61 +} + +type SevenZip struct { + // If true, errors encountered during reading or writing + // a file within an archive will be logged and the + // operation will continue on remaining files. + ContinueOnError bool + + // The password, if dealing with an encrypted archive. + Password string +} + +func (SevenZip) Extension() string { return ".7z" } +func (SevenZip) MediaType() string { return "application/x-7z-compressed" } + +func (z SevenZip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), z.Extension()) { + mr.ByName = true + } + + // match file header + buf, err := readAtMost(stream, len(sevenZipHeader)) + if err != nil { + return mr, err + } + mr.ByStream = bytes.Equal(buf, sevenZipHeader) + + return mr, nil +} + +// Archive is not implemented for 7z because I do not know of a pure-Go 7z writer. + +// Extract extracts files from z, implementing the Extractor interface. Uniquely, however, +// sourceArchive must be an io.ReaderAt and io.Seeker, which are oddly disjoint interfaces +// from io.Reader which is what the method signature requires. We chose this signature for +// the interface because we figure you can Read() from anything you can ReadAt() or Seek() +// with. Due to the nature of the zip archive format, if sourceArchive is not an io.Seeker +// and io.ReaderAt, an error is returned. 
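+// (An *os.File satisfies both io.ReaderAt and io.Seeker, so as a rough
+// sketch of typical use, assuming an archive named "example.7z" and a
+// caller-provided handler FileHandler:
+//
+//	f, _ := os.Open("example.7z")
+//	err := SevenZip{}.Extract(ctx, f, handler)
+//
+// would read the archive from disk.)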
+func (z SevenZip) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { + sra, ok := sourceArchive.(seekReaderAt) + if !ok { + return fmt.Errorf("input type must be an io.ReaderAt and io.Seeker because of zip format constraints") + } + + size, err := streamSizeBySeeking(sra) + if err != nil { + return fmt.Errorf("determining stream size: %w", err) + } + + zr, err := sevenzip.NewReaderWithPassword(sra, size, z.Password) + if err != nil { + return err + } + + // important to initialize to non-nil, empty value due to how fileIsIncluded works + skipDirs := skipList{} + + for i, f := range zr.File { + if err := ctx.Err(); err != nil { + return err // honor context cancellation + } + + if fileIsIncluded(skipDirs, f.Name) { + continue + } + + fi := f.FileInfo() + file := FileInfo{ + FileInfo: fi, + Header: f.FileHeader, + NameInArchive: f.Name, + Open: func() (fs.File, error) { + openedFile, err := f.Open() + if err != nil { + return nil, err + } + return fileInArchive{openedFile, fi}, nil + }, + } + + err := handleFile(ctx, file) + if errors.Is(err, fs.SkipAll) { + break + } else if errors.Is(err, fs.SkipDir) && file.IsDir() { + skipDirs.add(f.Name) + } else if err != nil { + if z.ContinueOnError { + log.Printf("[ERROR] %s: %v", f.Name, err) + continue + } + return fmt.Errorf("handling file %d: %s: %w", i, f.Name, err) + } + } + + return nil +} + +// https://py7zr.readthedocs.io/en/latest/archive_format.html#signature +var sevenZipHeader = []byte("7z\xBC\xAF\x27\x1C") + +// Interface guard +var _ Extractor = SevenZip{} diff --git a/vendor/github.com/mholt/archiver/v3/LICENSE b/vendor/github.com/mholt/archives/LICENSE similarity index 100% rename from vendor/github.com/mholt/archiver/v3/LICENSE rename to vendor/github.com/mholt/archives/LICENSE diff --git a/vendor/github.com/mholt/archives/README.md b/vendor/github.com/mholt/archives/README.md new file mode 100644 index 0000000000..ff174f9c00 --- /dev/null +++ b/vendor/github.com/mholt/archives/README.md @@ -0,0 +1,342 @@ +# archives [![Go Reference](https://pkg.go.dev/badge/github.com/mholt/archives.svg)](https://pkg.go.dev/github.com/mholt/archives) [![Linux](https://github.com/mholt/archives/actions/workflows/ubuntu-latest.yml/badge.svg)](https://github.com/mholt/archives/actions/workflows/ubuntu-latest.yml) [![Mac](https://github.com/mholt/archives/actions/workflows/macos-latest.yml/badge.svg)](https://github.com/mholt/archives/actions/workflows/macos-latest.yml) [![Windows](https://github.com/mholt/archives/actions/workflows/windows-latest.yml/badge.svg)](https://github.com/mholt/archives/actions/workflows/windows-latest.yml) + +Introducing **mholt/archives** - a cross-platform, multi-format Go library for working with archives and compression formats with a unified API and as virtual file systems compatible with [`io/fs`](https://pkg.go.dev/io/fs). 
+ + +## Features + +- Stream-oriented APIs +- Automatically identify archive and compression formats: + - By file name + - By stream peeking (headers) +- Traverse directories, archives, and other files uniformly as [`io/fs`](https://pkg.go.dev/io/fs) file systems: + - [`FileFS`](https://pkg.go.dev/github.com/mholt/archives#FileFS) + - [`DirFS`](https://pkg.go.dev/github.com/mholt/archives#DirFS) + - [`ArchiveFS`](https://pkg.go.dev/github.com/mholt/archives#ArchiveFS) +- Seamlessly walk into archive files using [`DeepFS`](https://pkg.go.dev/github.com/mholt/archives#DeepFS) +- Compress and decompress files +- Create and extract archive files +- Walk or traverse into archive files +- Extract only specific files from archives +- Insert into (append to) .tar and .zip archives without re-creating entire archive +- Numerous archive and compression formats supported +- Read from password-protected 7-Zip and RAR files +- Extensible (add more formats just by registering them) +- Cross-platform, static binary +- Pure Go (no cgo) +- Multithreaded Gzip +- Adjustable compression levels +- Super-fast Snappy implementation (via [S2](https://github.com/klauspost/compress/blob/master/s2/README.md)) + +### Supported compression formats + +- brotli (.br) +- bzip2 (.bz2) +- flate (.zip) +- gzip (.gz) +- lz4 (.lz4) +- lzip (.lz) +- minlz (.mz) +- snappy (.sz) and S2 (.s2) +- xz (.xz) +- zlib (.zz) +- zstandard (.zst) + +### Supported archive formats + +- .zip +- .tar (including any compressed variants like .tar.gz) +- .rar (read-only) +- .7z (read-only) + +## Command line utility + +There is an independently-maintained command line tool called [**`arc`**](https://github.com/jm33-m0/arc) currently in development that will expose many of the functions of this library to a shell. + +## Library use + +```bash +$ go get github.com/mholt/archives +``` + + +### Create archive + +Creating archives can be done entirely without needing a real disk or storage device. All you need is a list of [`FileInfo` structs](https://pkg.go.dev/github.com/mholt/archives#FileInfo), which can be implemented without a real file system. + +However, creating archives from a disk is very common, so you can use the [`FilesFromDisk()` function](https://pkg.go.dev/github.com/mholt/archives#FilesFromDisk) to help you map filenames on disk to their paths in the archive. 
+ +In this example, we add 4 files and a directory (which includes its contents recursively) to a .tar.gz file: + +```go +ctx := context.TODO() + +// map files on disk to their paths in the archive using default settings (second arg) +files, err := archives.FilesFromDisk(ctx, nil, map[string]string{ + "/path/on/disk/file1.txt": "file1.txt", + "/path/on/disk/file2.txt": "subfolder/file2.txt", + "/path/on/disk/file3.txt": "", // put in root of archive as file3.txt + "/path/on/disk/file4.txt": "subfolder/", // put in subfolder as file4.txt + "/path/on/disk/folder": "Custom Folder", // contents added recursively +}) +if err != nil { + return err +} + +// create the output file we'll write to +out, err := os.Create("example.tar.gz") +if err != nil { + return err +} +defer out.Close() + +// we can use the CompressedArchive type to gzip a tarball +// (since we're writing, we only set Archival, but if you're +// going to read, set Extraction) +format := archives.CompressedArchive{ + Compression: archives.Gz{}, + Archival: archives.Tar{}, +} + +// create the archive +err = format.Archive(ctx, out, files) +if err != nil { + return err +} +``` + +### Extract archive + +Extracting an archive, extracting _from_ an archive, and walking an archive are all the same function. + +Simply use your format type (e.g. `Zip`) to call `Extract()`. You'll pass in a context (for cancellation), the input stream, and a callback function to handle each file. + +```go +// the type that will be used to read the input stream +var format archives.Zip + +err := format.Extract(ctx, input, func(ctx context.Context, f archives.FileInfo) error { + // do something with the file here; or, if you only want a specific file or directory, + // just return until you come across the desired f.NameInArchive value(s) + return nil +}) +if err != nil { + return err +} +``` + +### Identifying formats + +When you have an input stream with unknown contents, this package can identify it for you. It will try matching based on filename and/or the header (which peeks at the stream): + +```go +// unless your stream is an io.Seeker, use the returned stream value to +// ensure you re-read the bytes consumed during Identify() +format, stream, err := archives.Identify(ctx, "filename.tar.zst", stream) +if err != nil { + return err +} + +// you can now type-assert format to whatever you need + +// want to extract something? +if ex, ok := format.(archives.Extractor); ok { + // ... proceed to extract +} + +// or maybe it's compressed and you want to decompress it? +if decomp, ok := format.(archives.Decompressor); ok { + rc, err := decomp.OpenReader(unknownFile) + if err != nil { + return err + } + defer rc.Close() + + // read from rc to get decompressed data +} +``` + +`Identify()` works by reading an arbitrary number of bytes from the beginning of the stream (just enough to check for file headers). It buffers them and returns a new reader that lets you re-read them anew. If your input stream is `io.Seeker` however, no buffer is created as it uses `Seek()` instead, and the returned stream is the same as the input. + +### Virtual file systems + +This is my favorite feature. + +Let's say you have a directory on disk, an archive, a compressed archive, any other regular file, or a stream of any of the above! You don't really care; you just want to use it uniformly no matter what it is. 
+
+Simply create a file system:
+
+```go
+// filename could be:
+// - a folder ("/home/you/Desktop")
+// - an archive ("example.zip")
+// - a compressed archive ("example.tar.gz")
+// - a regular file ("example.txt")
+// - a compressed regular file ("example.txt.gz")
+// and/or the last argument could be a stream of any of the above
+fsys, err := archives.FileSystem(ctx, filename, nil)
+if err != nil {
+	return err
+}
+```
+
+This is a fully-featured `fs.FS`, so you can open files and read directories, no matter what kind of file the input was.
+
+For example, to open a specific file:
+
+```go
+f, err := fsys.Open("file")
+if err != nil {
+	return err
+}
+defer f.Close()
+```
+
+If you opened a regular file or archive, you can read from it. If it's a compressed file, reads are automatically decompressed.
+
+If you opened a directory (either real or in an archive), you can list its contents:
+
+```go
+if dir, ok := f.(fs.ReadDirFile); ok {
+	// 0 gets all entries, but you can pass > 0 to paginate
+	entries, err := dir.ReadDir(0)
+	if err != nil {
+		return err
+	}
+	for _, e := range entries {
+		fmt.Println(e.Name())
+	}
+}
+```
+
+Or get a directory listing this way:
+
+```go
+entries, err := fsys.ReadDir("Playlists")
+if err != nil {
+	return err
+}
+for _, e := range entries {
+	fmt.Println(e.Name())
+}
+```
+
+Or maybe you want to walk all or part of the file system, but skip a folder named `.git`:
+
+```go
+err := fs.WalkDir(fsys, ".", func(path string, d fs.DirEntry, err error) error {
+	if err != nil {
+		return err
+	}
+	if path == ".git" {
+		return fs.SkipDir
+	}
+	fmt.Println("Walking:", path, "Dir?", d.IsDir())
+	return nil
+})
+if err != nil {
+	return err
+}
+```
+
+The `archives` package lets you do it all.
+
+**Important .tar note:** Tar files do not efficiently implement file system semantics due to their historical roots in sequential-access design for tapes. File systems inherently assume some index facilitating random access, but tar files need to be read from the beginning to access something at the end. This is especially slow when the archive is compressed. Optimizations have been implemented to amortize `ReadDir()` calls so that `fs.WalkDir()` only has to scan the archive once, but they use more memory. Open calls require another scan to find the file. It may be more efficient to use `Tar.Extract()` directly if file system semantics are not important to you, as sketched below.
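+
+For example, if you only need to stream each entry once, a minimal sketch (assuming a hypothetical local `example.tar.gz`, and reusing the `CompressedArchive` type from the archival example above) can bypass the file system layer entirely:
+
+```go
+f, err := os.Open("example.tar.gz")
+if err != nil {
+	return err
+}
+defer f.Close()
+
+format := archives.CompressedArchive{
+	Compression: archives.Gz{},
+	Extraction:  archives.Tar{},
+}
+
+// each entry is visited exactly once, in archive order
+err = format.Extract(ctx, f, func(ctx context.Context, info archives.FileInfo) error {
+	fmt.Println(info.NameInArchive)
+	return nil
+})
+if err != nil {
+	return err
+}
+```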
+
+#### Use with `http.FileServer`
+
+The file system can be used with http.FileServer to browse archives and directories in a browser. However, due to how http.FileServer works, don't use it directly with compressed files; instead, wrap it like the following:
+
+```go
+fileServer := http.FileServer(http.FS(archiveFS))
+http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) {
+	// disable range request
+	writer.Header().Set("Accept-Ranges", "none")
+	request.Header.Del("Range")
+
+	// disable content-type sniffing
+	ctype := mime.TypeByExtension(filepath.Ext(request.URL.Path))
+	writer.Header()["Content-Type"] = nil
+	if ctype != "" {
+		writer.Header().Set("Content-Type", ctype)
+	}
+	fileServer.ServeHTTP(writer, request)
+})
+```
+
+http.FileServer will try to sniff the Content-Type by default if it can't be inferred from the file name. To do this, the http package will try to read from the file and then Seek back to the start, which the library currently can't do. The same goes for Range requests. Seeking in archives is not currently supported by this package due to limitations in dependencies.
+
+If Content-Type is desirable, you can [register it](https://pkg.go.dev/mime#AddExtensionType) yourself.
+
+### Compress data
+
+Compression formats let you open writers to compress data:
+
+```go
+// wrap underlying writer w
+compressor, err := archives.Zstd{}.OpenWriter(w)
+if err != nil {
+	return err
+}
+defer compressor.Close()
+
+// writes to compressor will be compressed
+```
+
+### Decompress data
+
+Similarly, compression formats let you open readers to decompress data:
+
+```go
+// wrap underlying reader r
+decompressor, err := archives.Snappy{}.OpenReader(r)
+if err != nil {
+	return err
+}
+defer decompressor.Close()
+
+// reads from decompressor will be decompressed
+```
+
+### Append to tarball and zip archives
+
+Tar and Zip archives can be appended to without creating a whole new archive by calling `Insert()` on a tar or zip stream. However, for tarballs, this requires that the tarball is not compressed (due to complexities with modifying compression dictionaries).
+
+Here is an example that appends a file to a tarball on disk:
+
+```go
+tarball, err := os.OpenFile("example.tar", os.O_RDWR, 0644)
+if err != nil {
+	return err
+}
+defer tarball.Close()
+
+// prepare a text file for the root of the archive
+files, err := archives.FilesFromDisk(context.Background(), nil, map[string]string{
+	"/home/you/lastminute.txt": "",
+})
+if err != nil {
+	return err
+}
+
+err = archives.Tar{}.Insert(context.Background(), tarball, files)
+if err != nil {
+	return err
+}
+```
+
+The code is similar for inserting into a Zip archive, except you'll call `Insert()` on a `Zip{}` value instead.
+
+
+### Traverse into archives while walking
+
+If you are traversing/walking the file system using [`fs.WalkDir()`](https://pkg.go.dev/io/fs#WalkDir), the [**`DeepFS`**](https://pkg.go.dev/github.com/mholt/archives#DeepFS) type lets you walk the contents of archives (and compressed archives!) transparently as if the archive file was a regular directory on disk.
+
+Simply root your DeepFS at a real path, then walk away:
+
+```go
+fsys := &archives.DeepFS{Root: "/some/dir"}
+
+err := fs.WalkDir(fsys, ".", func(fpath string, d fs.DirEntry, err error) error {
+	...
+})
+```
+
+You'll notice that paths within archives look like `/some/dir/archive.zip/foo/bar.txt`. If you pass a path like that into `fsys.Open()`, it will split the path at the end of the archive file (`/some/dir/archive.zip`) and use the remainder of the path (`foo/bar.txt`) inside the archive.
diff --git a/vendor/github.com/mholt/archives/archives.go b/vendor/github.com/mholt/archives/archives.go
new file mode 100644
index 0000000000..ee15ce50d5
--- /dev/null
+++ b/vendor/github.com/mholt/archives/archives.go
@@ -0,0 +1,377 @@
+package archives
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"io/fs"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// FileInfo is a virtualized, generalized file abstraction for interacting with archives.
+type FileInfo struct {
+	fs.FileInfo
+
+	// The file header as used/provided by the archive format.
+	// Typically, you do not need to set this field when creating
+	// an archive.
+	Header any
+
+	// The path of the file as it appears in the archive.
+	// This is equivalent to Header.Name (for most Header
+	// types). We require it to be specified here because
+	// it is such a common field and we want to preserve
+	// format-agnosticism (no type assertions) for basic
+	// operations.
+ // + // When extracting, this name or path may not have + // been sanitized; it should not be trusted at face + // value. Consider using path.Clean() before using. + // + // If this is blank when inserting a file into an + // archive, the filename's base may be assumed + // by default to be the name in the archive. + NameInArchive string + + // For symbolic and hard links, the target of the link. + // Not supported by all archive formats. + LinkTarget string + + // A callback function that opens the file to read its + // contents. The file must be closed when reading is + // complete. + Open func() (fs.File, error) +} + +func (f FileInfo) Stat() (fs.FileInfo, error) { return f.FileInfo, nil } + +// FilesFromDisk is an opinionated function that returns a list of FileInfos +// by walking the directories in the filenames map. The keys are the names on +// disk, and the values become their associated names in the archive. +// +// Map keys that specify directories on disk will be walked and added to the +// archive recursively, rooted at the named directory. They should use the +// platform's path separator (backslash on Windows; slash on everything else). +// For convenience, map keys that end in a separator ('/', or '\' on Windows) +// will enumerate contents only, without adding the folder itself to the archive. +// +// Map values should typically use slash ('/') as the separator regardless of +// the platform, as most archive formats standardize on that rune as the +// directory separator for filenames within an archive. For convenience, map +// values that are empty string are interpreted as the base name of the file +// (sans path) in the root of the archive; and map values that end in a slash +// will use the base name of the file in that folder of the archive. +// +// File gathering will adhere to the settings specified in options. +// +// This function is used primarily when preparing a list of files to add to +// an archive. +func FilesFromDisk(ctx context.Context, options *FromDiskOptions, filenames map[string]string) ([]FileInfo, error) { + var files []FileInfo + for rootOnDisk, rootInArchive := range filenames { + if err := ctx.Err(); err != nil { + return nil, err + } + + walkErr := filepath.WalkDir(rootOnDisk, func(filename string, d fs.DirEntry, err error) error { + if err := ctx.Err(); err != nil { + return err + } + if err != nil { + return err + } + + info, err := d.Info() + if err != nil { + return err + } + + nameInArchive := nameOnDiskToNameInArchive(filename, rootOnDisk, rootInArchive) + // this is the root folder and we are adding its contents to target rootInArchive + if info.IsDir() && nameInArchive == "" { + return nil + } + + // handle symbolic links + var linkTarget string + if isSymlink(info) { + if options != nil && options.FollowSymlinks { + originalFilename := filename + filename, info, err = followSymlink(filename) + if err != nil { + return err + } + if info.IsDir() { + symlinkDirFiles, err := FilesFromDisk(ctx, options, map[string]string{filename: nameInArchive}) + if err != nil { + return fmt.Errorf("getting files from symlink directory %s dereferenced to %s: %w", originalFilename, linkTarget, err) + } + + files = append(files, symlinkDirFiles...) 
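+						// the dereferenced directory's contents were added
+						// recursively above, so do not also add a separate
+						// entry for the symlink itself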
+ return nil + } + } else { + // preserve symlinks + linkTarget, err = os.Readlink(filename) + if err != nil { + return fmt.Errorf("%s: readlink: %w", filename, err) + } + } + } + + // handle file attributes + if options != nil && options.ClearAttributes { + info = noAttrFileInfo{info} + } + + file := FileInfo{ + FileInfo: info, + NameInArchive: nameInArchive, + LinkTarget: linkTarget, + Open: func() (fs.File, error) { + return os.Open(filename) + }, + } + + files = append(files, file) + + return nil + }) + if walkErr != nil { + return nil, walkErr + } + } + return files, nil +} + +// nameOnDiskToNameInArchive converts a filename from disk to a name in an archive, +// respecting rules defined by FilesFromDisk. nameOnDisk is the full filename on disk +// which is expected to be prefixed by rootOnDisk (according to fs.WalkDirFunc godoc) +// and which will be placed into a folder rootInArchive in the archive. +func nameOnDiskToNameInArchive(nameOnDisk, rootOnDisk, rootInArchive string) string { + // These manipulations of rootInArchive could be done just once instead of on + // every walked file since they don't rely on nameOnDisk which is the only + // variable that changes during the walk, but combining all the logic into this + // one function is easier to reason about and test. I suspect the performance + // penalty is insignificant. + if strings.HasSuffix(rootOnDisk, string(filepath.Separator)) { + // "map keys that end in a separator will enumerate contents only, + // without adding the folder itself to the archive." + rootInArchive = trimTopDir(rootInArchive) + } else if rootInArchive == "" { + // "map values that are empty string are interpreted as the base name + // of the file (sans path) in the root of the archive" + rootInArchive = filepath.Base(rootOnDisk) + } + if rootInArchive == "." { + // an in-archive root of "." is an escape hatch for the above rule + // where an empty in-archive root means to use the base name of the + // file; if the user does not want this, they can specify a "." to + // still put it in the root of the archive + rootInArchive = "" + } + if strings.HasSuffix(rootInArchive, "/") { + // "map values that end in a slash will use the base name of the file in + // that folder of the archive." + rootInArchive += filepath.Base(rootOnDisk) + } + truncPath := strings.TrimPrefix(nameOnDisk, rootOnDisk) + return path.Join(rootInArchive, filepath.ToSlash(truncPath)) +} + +// trimTopDir strips the top or first directory from the path. +// It expects a forward-slashed path. +// +// Examples: "a/b/c" => "b/c", "/a/b/c" => "b/c" +func trimTopDir(dir string) string { + return strings.TrimPrefix(dir, topDir(dir)+"/") +} + +// topDir returns the top or first directory in the path. +// It expects a forward-slashed path. +// +// Examples: "a/b/c" => "a", "/a/b/c" => "/a" +func topDir(dir string) string { + var start int + if len(dir) > 0 && dir[0] == '/' { + start = 1 + } + if pos := strings.Index(dir[start:], "/"); pos >= 0 { + return dir[:pos+start] + } + return dir +} + +// noAttrFileInfo is used to zero out some file attributes (issue #280). +type noAttrFileInfo struct{ fs.FileInfo } + +// Mode preserves only the type and permission bits. +func (no noAttrFileInfo) Mode() fs.FileMode { + return no.FileInfo.Mode() & (fs.ModeType | fs.ModePerm) +} +func (noAttrFileInfo) ModTime() time.Time { return time.Time{} } +func (noAttrFileInfo) Sys() any { return nil } + +// FromDiskOptions specifies various options for gathering files from disk. 
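+//
+// For example, a call that dereferences symlinks while gathering files
+// (a hypothetical sketch; names is a map as described for FilesFromDisk):
+//
+//	files, err := FilesFromDisk(ctx, &FromDiskOptions{FollowSymlinks: true}, names)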
+type FromDiskOptions struct { + // If true, symbolic links will be dereferenced, meaning that + // the link will not be added as a link, but what the link + // points to will be added as a file. + FollowSymlinks bool + + // If true, some file attributes will not be preserved. + // Name, size, type, and permissions will still be preserved. + ClearAttributes bool +} + +// FileHandler is a callback function that is used to handle files as they are read +// from an archive; it is kind of like fs.WalkDirFunc. Handler functions that open +// their files must not overlap or run concurrently, as files may be read from the +// same sequential stream; always close the file before returning. +// +// If the special error value fs.SkipDir is returned, the directory of the file +// (or the file itself if it is a directory) will not be walked. Note that because +// archive contents are not necessarily ordered, skipping directories requires +// memory, and skipping lots of directories may run up your memory bill. +// +// Any other returned error will terminate a walk and be returned to the caller. +type FileHandler func(ctx context.Context, info FileInfo) error + +// openAndCopyFile opens file for reading, copies its +// contents to w, then closes file. +func openAndCopyFile(file FileInfo, w io.Writer) error { + fileReader, err := file.Open() + if err != nil { + return err + } + defer fileReader.Close() + // When file is in use and size is being written to, creating the compressed + // file will fail with "archive/tar: write too long." Using CopyN gracefully + // handles this. + _, err = io.CopyN(w, fileReader, file.Size()) + if err != nil && err != io.EOF { + return err + } + return nil +} + +// fileIsIncluded returns true if filename is included according to +// filenameList; meaning it is in the list, its parent folder/path +// is in the list, or the list is nil. +func fileIsIncluded(filenameList []string, filename string) bool { + // include all files if there is no specific list + if filenameList == nil { + return true + } + for _, fn := range filenameList { + // exact matches are of course included + if filename == fn { + return true + } + // also consider the file included if its parent folder/path is in the list + if strings.HasPrefix(filename, strings.TrimSuffix(fn, "/")+"/") { + return true + } + } + return false +} + +func isSymlink(info fs.FileInfo) bool { + return info.Mode()&os.ModeSymlink != 0 +} + +// streamSizeBySeeking determines the size of the stream by +// seeking to the end, then back again, so the resulting +// seek position upon returning is the same as when called +// (assuming no errors). +func streamSizeBySeeking(s io.Seeker) (int64, error) { + currentPosition, err := s.Seek(0, io.SeekCurrent) + if err != nil { + return 0, fmt.Errorf("getting current offset: %w", err) + } + maxPosition, err := s.Seek(0, io.SeekEnd) + if err != nil { + return 0, fmt.Errorf("fast-forwarding to end: %w", err) + } + _, err = s.Seek(currentPosition, io.SeekStart) + if err != nil { + return 0, fmt.Errorf("returning to prior offset %d: %w", currentPosition, err) + } + return maxPosition, nil +} + +// skipList keeps a list of non-intersecting paths +// as long as its add method is used. Identical +// elements are rejected, more specific paths are +// replaced with broader ones, and more specific +// paths won't be added when a broader one already +// exists in the list. Trailing slashes are ignored. 
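+// For example, adding "a/b/c" and then "a/b" leaves only "a/b"
+// in the list, and adding "a/b/d" afterward is a no-op because
+// the broader "a/b" already covers it.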
+type skipList []string + +func (s *skipList) add(dir string) { + trimmedDir := strings.TrimSuffix(dir, "/") + var dontAdd bool + for i := 0; i < len(*s); i++ { + trimmedElem := strings.TrimSuffix((*s)[i], "/") + if trimmedDir == trimmedElem { + return + } + // don't add dir if a broader path already exists in the list + if strings.HasPrefix(trimmedDir, trimmedElem+"/") { + dontAdd = true + continue + } + // if dir is broader than a path in the list, remove more specific path in list + if strings.HasPrefix(trimmedElem, trimmedDir+"/") { + *s = append((*s)[:i], (*s)[i+1:]...) + i-- + } + } + if !dontAdd { + *s = append(*s, dir) + } +} + +// followSymlink follows a symlink until it finds a non-symlink, +// returning the target path, file info, and any error that occurs. +// It also checks for symlink loops and maximum depth. +func followSymlink(filename string) (string, os.FileInfo, error) { + visited := make(map[string]bool) + visited[filename] = true + // Limit in Linux kernel: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/namei.c?id=v3.5#n624 + const maxDepth = 40 + + for { + linkPath, err := os.Readlink(filename) + if err != nil { + return "", nil, fmt.Errorf("%s: readlink: %w", filename, err) + } + if !filepath.IsAbs(linkPath) { + linkPath = filepath.Join(filepath.Dir(filename), linkPath) + } + info, err := os.Lstat(linkPath) + if err != nil { + return "", nil, fmt.Errorf("%s: statting dereferenced symlink: %w", filename, err) + } + + // Not a symlink, we've found the target, return it + if info.Mode()&os.ModeSymlink == 0 { + return linkPath, info, nil + } + + if visited[linkPath] { + return "", nil, fmt.Errorf("%s: symlink loop", filename) + } + + if len(visited) >= maxDepth { + return "", nil, fmt.Errorf("%s: maximum symlink depth (%d) exceeded", filename, maxDepth) + } + + visited[linkPath] = true + filename = linkPath + } +} diff --git a/vendor/github.com/mholt/archives/brotli.go b/vendor/github.com/mholt/archives/brotli.go new file mode 100644 index 0000000000..67c892d397 --- /dev/null +++ b/vendor/github.com/mholt/archives/brotli.go @@ -0,0 +1,94 @@ +package archives + +import ( + "bytes" + "context" + "io" + "strings" + + "github.com/andybalholm/brotli" +) + +func init() { + RegisterFormat(Brotli{}) +} + +// Brotli facilitates brotli compression. +type Brotli struct { + Quality int +} + +func (Brotli) Extension() string { return ".br" } +func (Brotli) MediaType() string { return "application/x-br" } + +func (br Brotli) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), br.Extension()) { + mr.ByName = true + } + + if stream != nil { + // brotli does not have well-defined file headers or a magic number; + // the best way to match the stream is probably to try decoding part + // of it, but we'll just have to guess a large-enough size that is + // still small enough for the smallest streams we'll encounter + input := &bytes.Buffer{} + r := brotli.NewReader(io.TeeReader(stream, input)) + buf := make([]byte, 16) + + // First gauntlet - can the reader even read 16 bytes without an error? + n, err := r.Read(buf) + if err != nil { + return mr, nil + } + buf = buf[:n] + inputBytes := input.Bytes() + + // Second gauntlet - do the decompressed bytes exist in the raw input? 
+ // If they don't appear in the first 4 bytes (to account for the up to + // 32 bits of initial brotli header) or at all, then chances are the + // input was compressed. + idx := bytes.Index(inputBytes, buf) + if idx < 4 { + mr.ByStream = true + return mr, nil + } + + // The input is assumed to be compressed data, but we still can't be 100% sure. + // Try reading more data until we encounter an error. + for n < 128 { + nn, err := r.Read(buf) + switch err { + case io.EOF: + // If we've reached EOF, we return assuming it's compressed. + mr.ByStream = true + return mr, nil + case io.ErrUnexpectedEOF: + // If we've encountered a short read, that's probably due to invalid reads due + // to the fact it isn't compressed data at all. + return mr, nil + case nil: + // No error, no problem. Continue reading. + n += nn + default: + // If we encounter any other error, return it. + return mr, nil + } + } + + // If we haven't encountered an error by now, the input is probably compressed. + mr.ByStream = true + } + + return mr, nil +} + +func (br Brotli) OpenWriter(w io.Writer) (io.WriteCloser, error) { + return brotli.NewWriterLevel(w, br.Quality), nil +} + +func (Brotli) OpenReader(r io.Reader) (io.ReadCloser, error) { + return io.NopCloser(brotli.NewReader(r)), nil +} diff --git a/vendor/github.com/mholt/archives/bz2.go b/vendor/github.com/mholt/archives/bz2.go new file mode 100644 index 0000000000..ff7bb3db43 --- /dev/null +++ b/vendor/github.com/mholt/archives/bz2.go @@ -0,0 +1,52 @@ +package archives + +import ( + "bytes" + "context" + "io" + "strings" + + "github.com/dsnet/compress/bzip2" +) + +func init() { + RegisterFormat(Bz2{}) +} + +// Bz2 facilitates bzip2 compression. +type Bz2 struct { + CompressionLevel int +} + +func (Bz2) Extension() string { return ".bz2" } +func (Bz2) MediaType() string { return "application/x-bzip2" } + +func (bz Bz2) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), bz.Extension()) { + mr.ByName = true + } + + // match file header + buf, err := readAtMost(stream, len(bzip2Header)) + if err != nil { + return mr, err + } + mr.ByStream = bytes.Equal(buf, bzip2Header) + + return mr, nil +} + +func (bz Bz2) OpenWriter(w io.Writer) (io.WriteCloser, error) { + return bzip2.NewWriter(w, &bzip2.WriterConfig{ + Level: bz.CompressionLevel, + }) +} + +func (Bz2) OpenReader(r io.Reader) (io.ReadCloser, error) { + return bzip2.NewReader(r, nil) +} + +var bzip2Header = []byte("BZh") diff --git a/vendor/github.com/mholt/archives/formats.go b/vendor/github.com/mholt/archives/formats.go new file mode 100644 index 0000000000..597dd48a56 --- /dev/null +++ b/vendor/github.com/mholt/archives/formats.go @@ -0,0 +1,440 @@ +package archives + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "path" + "path/filepath" + "strings" +) + +// RegisterFormat registers a format. It should be called during init. +// Duplicate formats by name are not allowed and will panic. +func RegisterFormat(format Format) { + name := strings.Trim(strings.ToLower(format.Extension()), ".") + if _, ok := formats[name]; ok { + panic("format " + name + " is already registered") + } + formats[name] = format +} + +// Identify iterates the registered formats and returns the one that +// matches the given filename and/or stream. 
It is capable of identifying +// compressed files (.gz, .xz...), archive files (.tar, .zip...), and +// compressed archive files (tar.gz, tar.bz2...). The returned Format +// value can be type-asserted to ascertain its capabilities. +// +// If no matching formats were found, special error NoMatch is returned. +// +// If stream is nil then it will only match on file name and the +// returned io.Reader will be nil. +// +// If stream is non-nil, it will be returned in the same read position +// as it was before Identify() was called, by virtue of buffering the +// peeked bytes. However, if the stream is an io.Seeker, Seek() must +// work, no extra buffering will be performed, and the original input +// value will be returned at the original position by seeking. +func Identify(ctx context.Context, filename string, stream io.Reader) (Format, io.Reader, error) { + var compression Compression + var archival Archival + var extraction Extraction + + filename = path.Base(filepath.ToSlash(filename)) + + rewindableStream, err := newRewindReader(stream) + if err != nil { + return nil, nil, err + } + + // try compression format first, since that's the outer "layer" if combined + for name, format := range formats { + cf, isCompression := format.(Compression) + if !isCompression { + continue + } + + matchResult, err := identifyOne(ctx, format, filename, rewindableStream, nil) + if err != nil { + return nil, rewindableStream.reader(), fmt.Errorf("matching %s: %w", name, err) + } + + // if matched, wrap input stream with decompression + // so we can see if it contains an archive within + if matchResult.Matched() { + compression = cf + break + } + } + + // try archival and extraction formats next + for name, format := range formats { + ar, isArchive := format.(Archival) + ex, isExtract := format.(Extraction) + if !isArchive && !isExtract { + continue + } + + matchResult, err := identifyOne(ctx, format, filename, rewindableStream, compression) + if err != nil { + return nil, rewindableStream.reader(), fmt.Errorf("matching %s: %w", name, err) + } + + if matchResult.Matched() { + archival = ar + extraction = ex + break + } + } + + // the stream should be rewound by identifyOne; then return the most specific type of match + bufferedStream := rewindableStream.reader() + switch { + case compression != nil && archival == nil && extraction == nil: + return compression, bufferedStream, nil + case compression == nil && archival != nil && extraction == nil: + return archival, bufferedStream, nil + case compression == nil && archival == nil && extraction != nil: + return extraction, bufferedStream, nil + case compression == nil && archival != nil && extraction != nil: + // archival and extraction are always set together, so they must be the same + return archival, bufferedStream, nil + case compression != nil && extraction != nil: + // in practice, this is only used for compressed tar files, and the tar format can + // both read and write, so the archival value should always work too; but keep in + // mind that Identify() is used on existing files to be read, not new files to write + return CompressedArchive{archival, extraction, compression}, bufferedStream, nil + default: + return nil, bufferedStream, NoMatch + } +} + +func identifyOne(ctx context.Context, format Format, filename string, stream *rewindReader, comp Compression) (mr MatchResult, err error) { + defer stream.rewind() + + if filename == "." 
{ + filename = "" + } + + // if looking within a compressed format, wrap the stream in a + // reader that can decompress it so we can match the "inner" format + // (yes, we have to make a new reader every time we do a match, + // because we reset/seek the stream each time and that can mess up + // the compression reader's state if we don't discard it also) + if comp != nil && stream != nil { + decompressedStream, openErr := comp.OpenReader(stream) + if openErr != nil { + return MatchResult{}, openErr + } + defer decompressedStream.Close() + mr, err = format.Match(ctx, filename, decompressedStream) + } else { + // Make sure we pass a nil io.Reader not a *rewindReader(nil) + var r io.Reader + if stream != nil { + r = stream + } + mr, err = format.Match(ctx, filename, r) + } + + // if the error is EOF, we can just ignore it. + // Just means we have a small input file. + if errors.Is(err, io.EOF) { + err = nil + } + return mr, err +} + +// readAtMost reads at most n bytes from the stream. A nil, empty, or short +// stream is not an error. The returned slice of bytes may have length < n +// without an error. +func readAtMost(stream io.Reader, n int) ([]byte, error) { + if stream == nil || n <= 0 { + return []byte{}, nil + } + + buf := make([]byte, n) + nr, err := io.ReadFull(stream, buf) + + // Return the bytes read if there was no error OR if the + // error was EOF (stream was empty) or UnexpectedEOF (stream + // had less than n). We ignore those errors because we aren't + // required to read the full n bytes; so an empty or short + // stream is not actually an error. + if err == nil || + errors.Is(err, io.EOF) || + errors.Is(err, io.ErrUnexpectedEOF) { + return buf[:nr], nil + } + + return nil, err +} + +// CompressedArchive represents an archive which is compressed externally +// (for example, a gzipped tar file, .tar.gz.) It combines a compression +// format on top of an archival/extraction format and provides both +// functionalities in a single type, allowing archival and extraction +// operations transparently through compression and decompression. However, +// compressed archives have some limitations; for example, files cannot be +// inserted/appended because of complexities with modifying existing +// compression state (perhaps this could be overcome, but I'm not about to +// try it). +type CompressedArchive struct { + Archival + Extraction + Compression +} + +// Name returns a concatenation of the archive and compression format extensions. +func (ca CompressedArchive) Extension() string { + var name string + if ca.Archival != nil { + name += ca.Archival.Extension() + } else if ca.Extraction != nil { + name += ca.Extraction.Extension() + } + name += ca.Compression.Extension() + return name +} + +// MediaType returns the compression format's MIME type, since +// a compressed archive is fundamentally a compressed file. +func (ca CompressedArchive) MediaType() string { return ca.Compression.MediaType() } + +// Match matches if the input matches both the compression and archival/extraction format. 
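+// The compression layer, if any, is matched first; when it matches, the
+// stream is wrapped with its decompressor so that the inner archival
+// format can be matched against the decompressed bytes.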
+func (ca CompressedArchive) Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error) { + var conglomerate MatchResult + + if ca.Compression != nil { + matchResult, err := ca.Compression.Match(ctx, filename, stream) + if err != nil { + return MatchResult{}, err + } + if !matchResult.Matched() { + return matchResult, nil + } + + // wrap the reader with the decompressor so we can + // attempt to match the archive by reading the stream + rc, err := ca.Compression.OpenReader(stream) + if err != nil { + return matchResult, err + } + defer rc.Close() + stream = rc + + conglomerate = matchResult + } + + if ca.Archival != nil { + matchResult, err := ca.Archival.Match(ctx, filename, stream) + if err != nil { + return MatchResult{}, err + } + if !matchResult.Matched() { + return matchResult, nil + } + conglomerate.ByName = conglomerate.ByName || matchResult.ByName + conglomerate.ByStream = conglomerate.ByStream || matchResult.ByStream + } + + return conglomerate, nil +} + +// Archive writes an archive to the output stream while compressing the result. +func (ca CompressedArchive) Archive(ctx context.Context, output io.Writer, files []FileInfo) error { + if ca.Archival == nil { + return fmt.Errorf("no archival format") + } + if ca.Compression != nil { + wc, err := ca.Compression.OpenWriter(output) + if err != nil { + return err + } + defer wc.Close() + output = wc + } + return ca.Archival.Archive(ctx, output, files) +} + +// ArchiveAsync adds files to the output archive while compressing the result asynchronously. +func (ca CompressedArchive) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error { + if ca.Archival == nil { + return fmt.Errorf("no archival format") + } + do, ok := ca.Archival.(ArchiverAsync) + if !ok { + return fmt.Errorf("%T archive does not support async writing", ca.Archival) + } + if ca.Compression != nil { + wc, err := ca.Compression.OpenWriter(output) + if err != nil { + return err + } + defer wc.Close() + output = wc + } + return do.ArchiveAsync(ctx, output, jobs) +} + +// Extract reads files out of a compressed archive while decompressing the results. +func (ca CompressedArchive) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { + if ca.Extraction == nil { + return fmt.Errorf("no extraction format") + } + if ca.Compression != nil { + rc, err := ca.Compression.OpenReader(sourceArchive) + if err != nil { + return err + } + defer rc.Close() + sourceArchive = rc + } + return ca.Extraction.Extract(ctx, sourceArchive, handleFile) +} + +// MatchResult returns true if the format was matched either +// by name, stream, or both. Name usually refers to matching +// by file extension, and stream usually refers to reading +// the first few bytes of the stream (its header). A stream +// match is generally stronger, as filenames are not always +// indicative of their contents if they even exist at all. +type MatchResult struct { + ByName, ByStream bool +} + +// Matched returns true if a match was made by either name or stream. +func (mr MatchResult) Matched() bool { return mr.ByName || mr.ByStream } + +func (mr MatchResult) String() string { + return fmt.Sprintf("{ByName=%v ByStream=%v}", mr.ByName, mr.ByStream) +} + +// rewindReader is a Reader that can be rewound (reset) to re-read what +// was already read and then continue to read more from the underlying +// stream. 
When no more rewinding is necessary, call reader() to get a +// new reader that first reads the buffered bytes, then continues to +// read from the stream. This is useful for "peeking" a stream an +// arbitrary number of bytes. Loosely based on the Connection type +// from https://github.com/mholt/caddy-l4. +// +// If the reader is also an io.Seeker, no buffer is used, and instead +// the stream seeks back to the starting position. +type rewindReader struct { + io.Reader + start int64 + buf *bytes.Buffer + bufReader io.Reader +} + +func newRewindReader(r io.Reader) (*rewindReader, error) { + if r == nil { + return nil, nil + } + + rr := &rewindReader{Reader: r} + + // avoid buffering if we have a seeker we can use + if seeker, ok := r.(io.Seeker); ok { + var err error + rr.start, err = seeker.Seek(0, io.SeekCurrent) + if err != nil { + return nil, fmt.Errorf("seek to determine current position: %w", err) + } + } else { + rr.buf = new(bytes.Buffer) + } + + return rr, nil +} + +func (rr *rewindReader) Read(p []byte) (n int, err error) { + if rr == nil { + panic("reading from nil rewindReader") + } + + // if there is a buffer we should read from, start + // with that; we only read from the underlying stream + // after the buffer has been "depleted" + if rr.bufReader != nil { + n, err = rr.bufReader.Read(p) + if err == io.EOF { + rr.bufReader = nil + err = nil + } + if n == len(p) { + return + } + } + + // buffer has been depleted or we are not using one, + // so read from underlying stream + nr, err := rr.Reader.Read(p[n:]) + + // anything that was read needs to be written to + // the buffer (if used), even if there was an error + if nr > 0 && rr.buf != nil { + if nw, errw := rr.buf.Write(p[n : n+nr]); errw != nil { + return nw, errw + } + } + + // up to now, n was how many bytes were read from + // the buffer, and nr was how many bytes were read + // from the stream; add them to return total count + n += nr + + return +} + +// rewind resets the stream to the beginning by causing +// Read() to start reading from the beginning of the +// stream, or, if buffering, the buffered bytes. +func (rr *rewindReader) rewind() { + if rr == nil { + return + } + if ras, ok := rr.Reader.(io.Seeker); ok { + if _, err := ras.Seek(rr.start, io.SeekStart); err == nil { + return + } + } + rr.bufReader = bytes.NewReader(rr.buf.Bytes()) +} + +// reader returns a reader that reads first from the buffered +// bytes (if buffering), then from the underlying stream; if a +// Seeker, the stream will be seeked back to the start. After +// calling this, no more rewinding is allowed since reads from +// the stream are not recorded, so rewinding properly is impossible. +// If the underlying reader implements io.Seeker, then the +// underlying reader will be used directly. +func (rr *rewindReader) reader() io.Reader { + if rr == nil { + return nil + } + if ras, ok := rr.Reader.(io.Seeker); ok { + if _, err := ras.Seek(rr.start, io.SeekStart); err == nil { + return rr.Reader + } + } + return io.MultiReader(bytes.NewReader(rr.buf.Bytes()), rr.Reader) +} + +// NoMatch is a special error returned if there are no matching formats. +var NoMatch = fmt.Errorf("no formats matched") + +// Registered formats. 
+var formats = make(map[string]Format) + +// Interface guards +var ( + _ Format = (*CompressedArchive)(nil) + _ Archiver = (*CompressedArchive)(nil) + _ ArchiverAsync = (*CompressedArchive)(nil) + _ Extractor = (*CompressedArchive)(nil) + _ Compressor = (*CompressedArchive)(nil) + _ Decompressor = (*CompressedArchive)(nil) +) diff --git a/vendor/github.com/mholt/archives/fs.go b/vendor/github.com/mholt/archives/fs.go new file mode 100644 index 0000000000..f1d3efad7e --- /dev/null +++ b/vendor/github.com/mholt/archives/fs.go @@ -0,0 +1,1161 @@ +package archives + +import ( + "context" + "errors" + "fmt" + "io" + "io/fs" + "os" + "path" + "path/filepath" + "runtime" + "slices" + "strings" + "sync" + "time" +) + +// FileSystem identifies the format of the input and returns a read-only file system. +// The input can be a filename, stream, or both. +// +// If only a filename is specified, it may be a path to a directory, archive file, +// compressed archive file, compressed regular file, or any other regular file on +// disk. If the filename is a directory, its contents are accessed directly from +// the device's file system. If the filename is an archive file, the contents can +// be accessed like a normal directory; compressed archive files are transparently +// decompressed as contents are accessed. And if the filename is any other file, it +// is the only file in the returned file system; if the file is compressed, it is +// transparently decompressed when read from. +// +// If a stream is specified, the filename (if available) is used as a hint to help +// identify its format. Streams of archive files must be able to be made into an +// io.SectionReader (for safe concurrency) which requires io.ReaderAt and io.Seeker +// (to efficiently determine size). The automatic format identification requires +// io.Reader and will use io.Seeker if supported to avoid buffering. +// +// Whether the data comes from disk or a stream, it is peeked at to automatically +// detect which format to use. +// +// This function essentially offers uniform read access to various kinds of files: +// directories, archives, compressed archives, individual files, and file streams +// are all treated the same way. +// +// NOTE: The performance of compressed tar archives is not great due to overhead +// with decompression. However, the fs.WalkDir() use case has been optimized to +// create an index on first call to ReadDir(). 
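+//
+// As an illustrative sketch only (the archive name is hypothetical, a ctx
+// context.Context is assumed to be in scope, and error handling is elided),
+// a caller might walk an archive like this:
+//
+//	fsys, err := FileSystem(ctx, "example.tar.gz", nil)
+//	if err != nil {
+//		return err
+//	}
+//	err = fs.WalkDir(fsys, ".", func(p string, d fs.DirEntry, err error) error {
+//		if err != nil {
+//			return err
+//		}
+//		fmt.Println(p, d.IsDir())
+//		return nil
+//	})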
+func FileSystem(ctx context.Context, filename string, stream ReaderAtSeeker) (fs.FS, error) { + if filename == "" && stream == nil { + return nil, errors.New("no input") + } + + // if an input stream is specified, we'll use that for identification + // and for ArchiveFS (if it's an archive); but if not, we'll open the + // file and read it for identification, but in that case we won't want + // to also use it for the ArchiveFS (because we need to close what we + // opened, and ArchiveFS opens its own files), hence this separate var + idStream := stream + + // if input is only a filename (no stream), check if it's a directory; + // if not, open it so we can determine which format to use (filename + // is not always a good indicator of file format) + if filename != "" && stream == nil { + info, err := os.Stat(filename) + if err != nil { + return nil, err + } + + // real folders can be accessed easily + if info.IsDir() { + return DirFS(filename), nil + } + + // if any archive formats recognize this file, access it like a folder + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + idStream = file // use file for format identification only + } + + // normally, callers should use the Reader value returned from Identify, but + // our input is a Seeker, so we know the original input value gets returned + format, _, err := Identify(ctx, filepath.Base(filename), idStream) + if errors.Is(err, NoMatch) { + return FileFS{Path: filename}, nil // must be an ordinary file + } + if err != nil { + return nil, fmt.Errorf("identify format: %w", err) + } + + switch fileFormat := format.(type) { + case Extractor: + // if no stream was input, return an ArchiveFS that relies on the filepath + if stream == nil { + return &ArchiveFS{Path: filename, Format: fileFormat, Context: ctx}, nil + } + + // otherwise, if a stream was input, return an ArchiveFS that relies on that + + // determine size -- we know that the stream value we get back from + // Identify is the same type as what we input because it is a Seeker + size, err := streamSizeBySeeking(stream) + if err != nil { + return nil, fmt.Errorf("seeking for size: %w", err) + } + + sr := io.NewSectionReader(stream, 0, size) + + return &ArchiveFS{Stream: sr, Format: fileFormat, Context: ctx}, nil + + case Compression: + return FileFS{Path: filename, Compression: fileFormat}, nil + } + + return nil, fmt.Errorf("unable to create file system rooted at %s due to unsupported file or folder type", filename) +} + +// ReaderAtSeeker is a type that can read, read at, and seek. +// os.File and io.SectionReader both implement this interface. +type ReaderAtSeeker interface { + io.Reader + io.ReaderAt + io.Seeker +} + +// FileFS allows accessing a file on disk using a consistent file system interface. +// The value should be the path to a regular file, not a directory. This file will +// be the only entry in the file system and will be at its root. It can be accessed +// within the file system by the name of "." or the filename. +// +// If the file is compressed, set the Compression field so that reads from the +// file will be transparently decompressed. +type FileFS struct { + // The path to the file on disk. + Path string + + // If file is compressed, setting this field will + // transparently decompress reads. + Compression Decompressor +} + +// Open opens the named file, which must be the file used to create the file system. 
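+//
+// For illustration (the path is hypothetical; error handling elided), a
+// compressed log file could be exposed as a single-file file system:
+//
+//	fsys := FileFS{Path: "/var/log/app.log.gz", Compression: Gz{}}
+//	f, err := fsys.Open("app.log.gz") // or "."
+//	// reads from f are transparently gzip-decompressed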
+func (f FileFS) Open(name string) (fs.File, error) {
+	if err := f.checkName(name, "open"); err != nil {
+		return nil, err
+	}
+	file, err := os.Open(f.Path)
+	if err != nil {
+		return nil, err
+	}
+	if f.Compression == nil {
+		return file, nil
+	}
+	r, err := f.Compression.OpenReader(file)
+	if err != nil {
+		return nil, err
+	}
+	return compressedFile{r, closeBoth{file, r}}, nil
+}
+
+// Stat stats the named file, which must be the file used to create the file system.
+func (f FileFS) Stat(name string) (fs.FileInfo, error) {
+	if err := f.checkName(name, "stat"); err != nil {
+		return nil, err
+	}
+	return os.Stat(f.Path)
+}
+
+// ReadDir returns a directory listing with the file as the singular entry.
+func (f FileFS) ReadDir(name string) ([]fs.DirEntry, error) {
+	if err := f.checkName(name, "readdir"); err != nil {
+		return nil, err
+	}
+	info, err := f.Stat(name)
+	if err != nil {
+		return nil, err
+	}
+	return []fs.DirEntry{fs.FileInfoToDirEntry(info)}, nil
+}
+
+// checkName ensures the name is a valid path and also, in the case of
+// the FileFS, that it is either ".", the filename originally passed in
+// to create the FileFS, or the base of the filename (name without path).
+// Other names do not make sense for a FileFS since the FS is only 1 file.
+func (f FileFS) checkName(name, op string) error {
+	if name == f.Path {
+		return nil
+	}
+	if !fs.ValidPath(name) {
+		return &fs.PathError{Op: op, Path: name, Err: fs.ErrInvalid}
+	}
+	if name != "." && name != filepath.Base(f.Path) {
+		return &fs.PathError{Op: op, Path: name, Err: fs.ErrNotExist}
+	}
+	return nil
+}
+
+// compressedFile is an fs.File that specially reads
+// from a decompression reader, and which closes both
+// that reader and the underlying file.
+type compressedFile struct {
+	io.Reader // decompressor
+	closeBoth // file and decompressor
+}
+
+// DirFS is similar to os.dirFS (obtained via os.DirFS()), but it is
+// exported so it can be used with type assertions. It also returns
+// FileInfo/DirEntry values where Name() always returns the name of
+// the directory instead of ".". This type does not guarantee any
+// sort of sandboxing.
+type DirFS string
+
+// Open opens the named file.
+func (d DirFS) Open(name string) (fs.File, error) {
+	if err := d.checkName(name, "open"); err != nil {
+		return nil, err
+	}
+	return os.Open(filepath.Join(string(d), name))
+}
+
+// ReadDir returns a listing of all the files in the named directory.
+func (d DirFS) ReadDir(name string) ([]fs.DirEntry, error) {
+	if err := d.checkName(name, "readdir"); err != nil {
+		return nil, err
+	}
+	return os.ReadDir(filepath.Join(string(d), name))
+}
+
+// Stat returns info about the named file.
+func (d DirFS) Stat(name string) (fs.FileInfo, error) {
+	if err := d.checkName(name, "stat"); err != nil {
+		return nil, err
+	}
+	info, err := os.Stat(filepath.Join(string(d), name))
+	if err != nil {
+		return info, err
+	}
+	if info.Name() == "." {
+		info = dotFileInfo{info, filepath.Base(string(d))}
+	}
+	return info, nil
+}
+
+// Sub returns an FS corresponding to the subtree rooted at dir.
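+//
+// For example (hypothetical directory), a view can be narrowed to a
+// subdirectory while keeping DirFS semantics:
+//
+//	sub, err := DirFS("/srv/data").Sub("photos")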
+func (d DirFS) Sub(dir string) (fs.FS, error) {
+	if err := d.checkName(dir, "sub"); err != nil {
+		return nil, err
+	}
+	info, err := d.Stat(dir)
+	if err != nil {
+		return nil, err
+	}
+	if !info.IsDir() {
+		return nil, fmt.Errorf("%s is not a directory", dir)
+	}
+	return DirFS(filepath.Join(string(d), dir)), nil
+}
+
+// checkName returns an error if name is not a valid path according to the docs of
+// the io/fs package, with an extra cue taken from the standard lib's implementation
+// of os.dirFS.Open(), which checks for invalid characters in Windows paths.
+func (DirFS) checkName(name, op string) error {
+	if !fs.ValidPath(name) || runtime.GOOS == "windows" && strings.ContainsAny(name, `\:`) {
+		return &fs.PathError{Op: op, Path: name, Err: fs.ErrInvalid}
+	}
+	return nil
+}
+
+// ArchiveFS allows reading an archive (or a compressed archive) using a
+// consistent file system interface. Essentially, it allows traversal and
+// reading of archive contents the same way as any normal directory on disk.
+// The contents of compressed archives are transparently decompressed.
+//
+// A valid ArchiveFS value must set either Path or Stream, but not both.
+// If Path is set, a literal file will be opened from the disk.
+// If Stream is set, new SectionReaders will be implicitly created to
+// access the stream, enabling safe, concurrent access.
+//
+// NOTE: Due to Go's file system APIs (see package io/fs), the performance
+// of ArchiveFS can suffer when using fs.WalkDir(). To mitigate this,
+// an optimized fs.ReadDirFS has been implemented that indexes the entire
+// archive on the first call to ReadDir() (since the entire archive needs
+// to be walked for every call to ReadDir() anyway, as archive contents are
+// often unordered). The first call to ReadDir(), i.e. near the start of the
+// walk, will be slow for large archives, but should be instantaneous after.
+// If you don't care about walking a file system in directory order, consider
+// calling Extract() on the underlying archive format type directly, which
+// walks the archive in entry order, without needing to do any sorting.
+//
+// Note that fs.FS implementations, including this one, reject paths starting
+// with "./". This can be problematic sometimes, as it is not uncommon for
+// tarballs to contain a top-level/root directory literally named ".", which
+// can happen if a tarball is created in the same directory it is archiving.
+// The underlying Extract() calls are faithful to entries with this name,
+// but file systems have certain semantics around "." that restrict its use.
+// For example, a file named "." cannot be created on a real file system
+// because it is a special name that means "current directory".
+//
+// We had to decide whether to honor the true name in the archive, or honor
+// file system semantics. Given that this is a virtual file system and other
+// code using the fs.FS APIs will trip over a literal directory named ".",
+// we choose to honor file system semantics. Files named "." are ignored;
+// directories with this name are effectively transparent; their contents
+// get promoted up a directory/level. This means that for a file at "./x",
+// where "." is a literal directory name, the name passed to WalkDir
+// callbacks will be "x". If you need the raw, uninterpreted values from an
+// archive, use the formats' Extract() method directly. See
+// https://github.com/golang/go/issues/70155 for a little more background.
+//
+// This does have one negative edge case: a tar containing contents like
+// [x . ./x] will have a conflict on the file named "x" because "./x" will
+// also be accessed with the name of "x".
+type ArchiveFS struct {
+	// set one of these
+	Path   string            // path to the archive file on disk, or...
+	Stream *io.SectionReader // ...stream from which to read archive
+
+	Format  Extractor       // the archive format
+	Prefix  string          // optional subdirectory in which to root the fs
+	Context context.Context // optional; mainly for cancellation
+
+	// amortizing cache speeds up walks (esp. ReadDir)
+	contents map[string]fs.FileInfo
+	dirs     map[string][]fs.DirEntry
+}
+
+// context always returns a context, preferring f.Context if not nil.
+func (f ArchiveFS) context() context.Context {
+	if f.Context != nil {
+		return f.Context
+	}
+	return context.Background()
+}
+
+// Open opens the named file from within the archive. If name is "." then
+// the archive file itself will be opened as a directory file.
+func (f ArchiveFS) Open(name string) (fs.File, error) {
+	if !fs.ValidPath(name) {
+		return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)}
+	}
+
+	// apply prefix if fs is rooted in a subtree
+	name = path.Join(f.Prefix, name)
+
+	// if we've already indexed the archive, we can know quickly if the file doesn't exist,
+	// and we can also return directory files with their entries instantly
+	if f.contents != nil {
+		if info, found := f.contents[name]; found {
+			if info.IsDir() {
+				if entries, ok := f.dirs[name]; ok {
+					return &dirFile{info: info, entries: entries}, nil
+				}
+			}
+		} else {
+			if entries, found := f.dirs[name]; found {
+				return &dirFile{info: implicitDirInfo{implicitDirEntry{name}}, entries: entries}, nil
+			}
+			return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("open %s: %w", name, fs.ErrNotExist)}
+		}
+	}
+
+	// ensure we have some input to read from
+	if f.Path == "" && f.Stream == nil {
+		return nil, fmt.Errorf("no input; one of Path or Stream must be set")
+	}
+
+	// if a filename is specified, open the archive file
+	var archiveFile *os.File
+	var err error
+	if f.Stream == nil {
+		archiveFile, err = os.Open(f.Path)
+		if err != nil {
+			return nil, err
+		}
+		defer func() {
+			// close the archive file if extraction failed; we can only
+			// count on the user/caller closing it if they successfully
+			// got the handle to the extracted file
+			if err != nil {
+				archiveFile.Close()
+			}
+		}()
+	}
+
+	// handle special case of opening the archive root
+	if name == "." {
+		var archiveInfo fs.FileInfo
+		if archiveFile != nil {
+			archiveInfo, err = archiveFile.Stat()
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			archiveInfo = implicitDirInfo{
+				implicitDirEntry{"."},
+			}
+		}
+		var entries []fs.DirEntry
+		entries, err = f.ReadDir(name)
+		if err != nil {
+			return nil, err
+		}
+		if archiveFile != nil {
+			// the archiveFile is closed at return only if there's an
+			// error; in this case, though, we can close it regardless
+			if err := archiveFile.Close(); err != nil {
+				return nil, err
+			}
+		}
+		return &dirFile{
+			info:    dirFileInfo{archiveInfo},
+			entries: entries,
+		}, nil
+	}
+
+	var inputStream io.Reader
+	if f.Stream == nil {
+		inputStream = archiveFile
+	} else {
+		inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size())
+	}
+
+	var decompressor io.ReadCloser
+	if decomp, ok := f.Format.(Decompressor); ok && decomp != nil {
+		decompressor, err = decomp.OpenReader(inputStream)
+		if err != nil {
+			return nil, err
+		}
+		inputStream = decompressor
+	}
+
+	// prepare the handler that we'll need if we have to iterate the
+	// archive to find the file being requested
+	var fsFile fs.File
+	handler := func(ctx context.Context, file FileInfo) error {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+
+		// paths in archives can't necessarily be trusted; also clean up any "./" prefix
+		file.NameInArchive = path.Clean(file.NameInArchive)
+
+		// ignore this entry if it's neither the file we're looking for, nor
+		// one of its descendants; we can't just check that the filename is
+		// a prefix of the requested file, because that could wrongly match
+		// "a/b/c.jpg.json" if the requested filename is "a/b/c.jpg", and
+		// this could result in loading the wrong file (!!) so we append a
+		// path separator to ensure that can't happen: "a/b/c.jpg.json/"
+		// is not prefixed by "a/b/c.jpg/", but it will still match as we
+		// expect: "a/b/c/d/" is prefixed by "a/b/c/", allowing us to
+		// match descendant files, and "a/b/c.jpg/" is prefixed by
+		// "a/b/c.jpg/", allowing us to match exact filenames.
+		if !strings.HasPrefix(file.NameInArchive+"/", name+"/") {
+			return nil
+		}
+
+		// if this is the requested file, and it's a directory, set up the dirFile,
+		// which will include a listing of all its contents as we continue iterating
+		if file.NameInArchive == name && file.IsDir() {
+			fsFile = &dirFile{info: file} // will fill entries slice as we continue iterating
+			return nil
+		}
+
+		// if the named file was a directory and we are filling its entries,
+		// add this entry to the list
+		if df, ok := fsFile.(*dirFile); ok {
+			df.entries = append(df.entries, fs.FileInfoToDirEntry(file))
+
+			// don't traverse into subfolders
+			if file.IsDir() {
+				return fs.SkipDir
+			}
+
+			return nil
+		}
+
+		innerFile, err := file.Open()
+		if err != nil {
+			return err
+		}
+
+		fsFile = innerFile
+		if archiveFile != nil {
+			fsFile = closeBoth{File: innerFile, c: archiveFile}
+		}
+
+		if decompressor != nil {
+			fsFile = closeBoth{fsFile, decompressor}
+		}
+
+		return fs.SkipAll
+	}
+
+	// when we start the walk, we pass in a nil list of files to extract, since
+	// files may have a "." component in them, and the underlying format doesn't
+	// know about our file system semantics, so we need to filter ourselves (it's
+	// not significantly less efficient).
+	if ar, ok := f.Format.(CompressedArchive); ok {
+		// bypass the CompressedArchive format's opening of the decompressor, since
+		// we already did it because we need to keep it open after returning.
+ // "I BYPASSED THE COMPRESSOR!" -Rey + err = ar.Extraction.Extract(f.context(), inputStream, handler) + } else { + err = f.Format.Extract(f.context(), inputStream, handler) + } + if err != nil { + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("extract: %w", err)} + } + if fsFile == nil { + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("open %s: %w", name, fs.ErrNotExist)} + } + + return fsFile, nil +} + +// Stat stats the named file from within the archive. If name is "." then +// the archive file itself is statted and treated as a directory file. +func (f ArchiveFS) Stat(name string) (fs.FileInfo, error) { + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("%s: %w", name, fs.ErrInvalid)} + } + + if name == "." { + if f.Path != "" { + fileInfo, err := os.Stat(f.Path) + if err != nil { + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(a) %s: %w", name, err)} + } + return dirFileInfo{fileInfo}, nil + } else if f.Stream != nil { + return implicitDirInfo{implicitDirEntry{name}}, nil + } + } + + // apply prefix if fs is rooted in a subtree + name = path.Join(f.Prefix, name) + + // if archive has already been indexed, simply use it + if f.contents != nil { + if info, ok := f.contents[name]; ok { + return info, nil + } + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(b) %s: %w", name, fs.ErrNotExist)} + } + + var archiveFile *os.File + var err error + if f.Stream == nil { + archiveFile, err = os.Open(f.Path) + if err != nil { + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(c) %s: %w", name, err)} + } + defer archiveFile.Close() + } + + var result FileInfo + var fallback fs.FileInfo // possibly needed if only an implied directory + handler := func(ctx context.Context, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err + } + cleanName := path.Clean(file.NameInArchive) + if cleanName == name { + result = file + return fs.SkipAll + } + // it's possible the requested name is an implicit directory; + // remember if we see it along the way, just in case + if fallback == nil && strings.HasPrefix(cleanName, name) { + fallback = implicitDirInfo{implicitDirEntry{name}} + } + return nil + } + var inputStream io.Reader = archiveFile + if f.Stream != nil { + inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) + } + err = f.Format.Extract(f.context(), inputStream, handler) + if err != nil && result.FileInfo == nil { + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(d) %s: %w", name, fs.ErrNotExist)} + } + if result.FileInfo == nil { + // looks like the requested name does not exist in the archive, + // but we can return some basic info if it was an implicit directory + if fallback != nil { + return fallback, nil + } + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("stat(e) %s: %w", name, fs.ErrNotExist)} + } + return result.FileInfo, nil +} + +// ReadDir reads the named directory from within the archive. If name is "." +// then the root of the archive content is listed. 
+func (f *ArchiveFS) ReadDir(name string) ([]fs.DirEntry, error) { + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "readdir", Path: name, Err: fs.ErrInvalid} + } + + // apply prefix if fs is rooted in a subtree + name = path.Join(f.Prefix, name) + + // fs.WalkDir() calls ReadDir() once per directory, and for archives with + // lots of directories, that is very slow, since we have to traverse the + // entire archive in order to ensure that we got all the entries for a + // directory -- so we can fast-track this lookup if we've done the + // traversal already + if len(f.dirs) > 0 { + return f.dirs[name], nil + } + + f.contents = make(map[string]fs.FileInfo) + f.dirs = make(map[string][]fs.DirEntry) + + var archiveFile *os.File + var err error + if f.Stream == nil { + archiveFile, err = os.Open(f.Path) + if err != nil { + return nil, err + } + defer archiveFile.Close() + } + + handler := func(ctx context.Context, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err + } + + // can't always trust path names + file.NameInArchive = path.Clean(file.NameInArchive) + + // avoid infinite walk; apparently, creating a tar file in the target + // directory may result in an entry called "." in the archive; see #384 + if file.NameInArchive == "." { + return nil + } + + // if the name being requested isn't a directory, return an error similar to + // what most OSes return from the readdir system call when given a non-dir + if file.NameInArchive == name && !file.IsDir() { + return &fs.PathError{Op: "readdir", Path: name, Err: errors.New("not a directory")} + } + + // index this file info for quick access + f.contents[file.NameInArchive] = file + + // amortize the DirEntry list per directory, and prefer the real entry's DirEntry over an implicit/fake + // one we may have created earlier; first try to find if it exists, and if so, replace the value; + // otherwise insert it in sorted position + dir := path.Dir(file.NameInArchive) + dirEntry := fs.FileInfoToDirEntry(file) + idx, found := slices.BinarySearchFunc(f.dirs[dir], dirEntry, func(a, b fs.DirEntry) int { + return strings.Compare(a.Name(), b.Name()) + }) + if found { + f.dirs[dir][idx] = dirEntry + } else { + f.dirs[dir] = slices.Insert(f.dirs[dir], idx, dirEntry) + } + + // this loop looks like an abomination, but it's really quite simple: we're + // just iterating the directories of the path up to the root; i.e. we lob off + // the base (last component) of the path until no separators remain, i.e. 
only + // one component remains -- then loop again to make sure it's not a duplicate + // (start without the base, since we know the full filename is an actual entry + // in the archive, we don't need to create an implicit directory entry for it) + startingPath := strings.TrimPrefix(path.Dir(file.NameInArchive), "/") // see issue #31 + for dir, base := path.Dir(startingPath), path.Base(startingPath); base != "."; dir, base = path.Dir(dir), path.Base(dir) { + if err := ctx.Err(); err != nil { + return err + } + + var dirInfo fs.DirEntry = implicitDirInfo{implicitDirEntry{base}} + + // we are "filling in" any directories that could potentially be only implicit, + // and since a nested directory can have more than 1 item, we need to prevent + // duplication; for example: given a/b/c and a/b/d, we need to avoid adding + // an entry for "b" twice within "a" -- hence we search for it first, and if + // it doesn't already exist, we insert it in sorted position + idx, found := slices.BinarySearchFunc(f.dirs[dir], dirInfo, func(a, b fs.DirEntry) int { + return strings.Compare(a.Name(), b.Name()) + }) + if !found { + f.dirs[dir] = slices.Insert(f.dirs[dir], idx, dirInfo) + } + } + + return nil + } + + var inputStream io.Reader = archiveFile + if f.Stream != nil { + inputStream = io.NewSectionReader(f.Stream, 0, f.Stream.Size()) + } + + err = f.Format.Extract(f.context(), inputStream, handler) + if err != nil { + // these being non-nil implies that we have indexed the archive, + // but if an error occurred, we likely only got part of the way + // through and our index is incomplete, and we'd have to re-walk + // the whole thing anyway; so reset these to nil to avoid bugs + f.dirs = nil + f.contents = nil + return nil, fmt.Errorf("extract: %w", err) + } + + return f.dirs[name], nil +} + +// Sub returns an FS corresponding to the subtree rooted at dir. +func (f *ArchiveFS) Sub(dir string) (fs.FS, error) { + if !fs.ValidPath(dir) { + return nil, &fs.PathError{Op: "sub", Path: dir, Err: fs.ErrInvalid} + } + info, err := f.Stat(dir) + if err != nil { + return nil, err + } + if !info.IsDir() { + return nil, fmt.Errorf("%s is not a directory", dir) + } + // result is the same as what we're starting with, except + // we indicate a path prefix to be used for all operations; + // the reason we don't append to the Path field directly + // is because the input might be a stream rather than a + // path on disk, and the Prefix field is applied on both + result := f + result.Prefix = dir + return result, nil +} + +// DeepFS is a fs.FS that represents the real file system, but also has +// the ability to traverse into archive files as if they were part of the +// regular file system. If a filename component ends with an archive +// extension (e.g. .zip, .tar, .tar.gz, etc.), then the remainder of the +// filepath will be considered to be inside that archive. +// +// This allows treating archive files transparently as if they were part +// of the regular file system during a walk, which can be extremely useful +// for accessing data in an "ordinary" walk of the disk, without needing to +// first extract all the archives and use more disk space. +// +// Archives within archives are not supported. +// +// The listing of archive entries is retained for the lifetime of the +// DeepFS value for efficiency, but this can use more memory if archives +// contain a lot of files. +// +// The exported fields may be changed during the lifetime of a DeepFS value +// (but not concurrently). 
It is safe to use this type as an FS concurrently. +type DeepFS struct { + // The root filepath using OS separator, even if it + // traverses into an archive. + Root string + + // An optional context, mainly for cancellation. + Context context.Context + + // remember archive file systems for efficiency + inners map[string]fs.FS + mu sync.Mutex +} + +func (fsys *DeepFS) Open(name string) (fs.File, error) { + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)} + } + name = path.Join(filepath.ToSlash(fsys.Root), name) + realPath, innerPath := fsys.SplitPath(name) + if innerPath != "" { + if innerFsys := fsys.getInnerFsys(realPath); innerFsys != nil { + return innerFsys.Open(innerPath) + } + } + return os.Open(realPath) +} + +func (fsys *DeepFS) Stat(name string) (fs.FileInfo, error) { + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)} + } + name = path.Join(filepath.ToSlash(fsys.Root), name) + realPath, innerPath := fsys.SplitPath(name) + if innerPath != "" { + if innerFsys := fsys.getInnerFsys(realPath); innerFsys != nil { + return fs.Stat(innerFsys, innerPath) + } + } + return os.Stat(realPath) +} + +// ReadDir returns the directory listing for the given directory name, +// but for any entries that appear by their file extension to be archive +// files, they are slightly modified to always return true for IsDir(), +// since we have the unique ability to list the contents of archives as +// if they were directories. +func (fsys *DeepFS) ReadDir(name string) ([]fs.DirEntry, error) { + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "readdir", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)} + } + name = path.Join(filepath.ToSlash(fsys.Root), name) + realPath, innerPath := fsys.SplitPath(name) + if innerPath != "" { + if innerFsys := fsys.getInnerFsys(realPath); innerFsys != nil { + return fs.ReadDir(innerFsys, innerPath) + } + } + entries, err := os.ReadDir(realPath) + if err != nil { + return nil, err + } + // make sure entries that appear to be archive files indicate they are a directory + // so the fs package will try to walk them + for i, entry := range entries { + if PathIsArchive(entry.Name()) { + entries[i] = alwaysDirEntry{entry} + } + } + return entries, nil +} + +// getInnerFsys reuses "inner" file systems, because for example, archives.ArchiveFS +// amortizes directory entries with the first call to ReadDir; if we don't reuse the +// file systems then they have to rescan the same archive multiple times. +func (fsys *DeepFS) getInnerFsys(realPath string) fs.FS { + realPath = filepath.Clean(realPath) + + fsys.mu.Lock() + defer fsys.mu.Unlock() + + if fsys.inners == nil { + fsys.inners = make(map[string]fs.FS) + } else if innerFsys, ok := fsys.inners[realPath]; ok { + return innerFsys + } + innerFsys, err := FileSystem(fsys.context(), realPath, nil) + if err == nil { + fsys.inners[realPath] = innerFsys + return innerFsys + } + return nil +} + +// SplitPath splits a file path into the "real" path and the "inner" path components, +// where the split point is the first extension of an archive filetype like ".zip" or +// ".tar.gz" that occurs in the path. +// +// The real path is the path that can be accessed on disk and will be returned with +// platform filepath separators. The inner path is the io/fs-compatible path that can +// be used within the archive. 
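+//
+// For example (illustrative; input uses forward slashes, output shown for a
+// Unix system):
+//
+//	realPath, innerPath := (&DeepFS{}).SplitPath("data/backup.tar.gz/etc/app.conf")
+//	// realPath == "data/backup.tar.gz" (in OS separators)
+//	// innerPath == "etc/app.conf"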
+//
+// If no archive extension is found in the path, only the realPath is returned.
+// If the input path is precisely an archive file (i.e. ends with an archive file
+// extension), then innerPath is returned as "." which indicates the root of the archive.
+func (*DeepFS) SplitPath(path string) (realPath, innerPath string) {
+	if len(path) < 2 {
+		realPath = path
+		return
+	}
+
+	// slightly more LoC, but more efficient, than exploding the path on every slash,
+	// is segmenting the path by using indices and looking at slices of the same
+	// string on every iteration; this avoids many allocations which can be valuable
+	// since this can be a hot path
+
+	// start at 1 instead of 0 because we know if the first slash is at 0, the part will be empty
+	start, end := 1, strings.Index(path[1:], "/")+1
+	if end-start < 0 {
+		end = len(path)
+	}
+
+	for {
+		part := strings.TrimRight(strings.ToLower(path[start:end]), " ")
+		if PathIsArchive(part) {
+			// we've found an archive extension, so the path until the end of this segment is
+			// the "real" OS path, and what remains (if anything) is the path within the archive
+			realPath = filepath.Clean(filepath.FromSlash(path[:end]))
+
+			if end < len(path) {
+				innerPath = path[end+1:]
+			} else {
+				// signal to the caller that this is an archive,
+				// even though it is the very root of the archive
+				innerPath = "."
+			}
+			return
+		}
+
+		// advance to the next segment, or end of string
+		start = end + 1
+		if start > len(path) {
+			break
+		}
+		end = strings.Index(path[start:], "/") + start
+		if end-start < 0 {
+			end = len(path)
+		}
+	}
+
+	// no archive extension found, so entire path is real path
+	realPath = filepath.Clean(filepath.FromSlash(path))
+	return
+}
+
+func (fsys *DeepFS) context() context.Context {
+	if fsys.Context != nil {
+		return fsys.Context
+	}
+	return context.Background()
+}
+
+// alwaysDirEntry always returns true for IsDir(). Because
+// DeepFS is able to walk archive files as directories,
+// this is used to trick fs.WalkDir into thinking they are
+// directories and thus traversing into them.
+type alwaysDirEntry struct {
+	fs.DirEntry
+}
+
+func (alwaysDirEntry) IsDir() bool { return true }
+
+// archiveExtensions contains extensions for popular and supported
+// archive types; sorted by popularity and with respect to some
+// being prefixed by other extensions.
+var archiveExtensions = []string{
+	".zip",
+	".tar",
+	".tgz",
+	".tar.gz",
+	".tar.bz2",
+	".tar.zst",
+	".tar.lz4",
+	".tar.xz",
+	".tar.sz",
+	".tar.s2",
+	".tar.lz",
+}
+
+// PathIsArchive returns true if the path ends with an archive file (i.e.
+// whether the path traverses to an archive) solely by lexical analysis (no
+// reading of files or headers is performed).
+func PathIsArchive(path string) bool {
+	// normalize the extension
+	path = strings.ToLower(path)
+	for _, ext := range archiveExtensions {
+		// check the full extension
+		if strings.HasSuffix(path, ext) {
+			return true
+		}
+	}
+	return false
+}
+
+// PathContainsArchive returns true if the path contains an archive file (i.e.
+// whether the path traverses into an archive) solely by lexical analysis (no
+// reading of files or headers is performed). Such a path is not typically
+// usable by the OS, but can be used by the DeepFS type. Slash must be the
+// path component separator.
Example: "/foo/example.zip/path/in/archive" +func PathContainsArchive(path string) bool { + pathPlusSep := path + "/" + for _, ext := range archiveExtensions { + if strings.Contains(pathPlusSep, ext+"/") { + return true + } + } + return false +} + +// TopDirOpen is a special Open() function that may be useful if +// a file system root was created by extracting an archive. +// +// It first tries the file name as given, but if that returns an +// error, it tries the name without the first element of the path. +// In other words, if "a/b/c" returns an error, then "b/c" will +// be tried instead. +// +// Consider an archive that contains a file "a/b/c". When the +// archive is extracted, the contents may be created without a +// new parent/root folder to contain them, and the path of the +// same file outside the archive may be lacking an exclusive root +// or parent container. Thus it is likely for a file system +// created for the same files extracted to disk to be rooted at +// one of the top-level files/folders from the archive instead of +// a parent folder. For example, the file known as "a/b/c" when +// rooted at the archive becomes "b/c" after extraction when rooted +// at "a" on disk (because no new, exclusive top-level folder was +// created). This difference in paths can make it difficult to use +// archives and directories uniformly. Hence these TopDir* functions +// which attempt to smooth over the difference. +// +// Some extraction utilities do create a container folder for +// archive contents when extracting, in which case the user +// may give that path as the root. In that case, these TopDir* +// functions are not necessary (but aren't harmful either). They +// are primarily useful if you are not sure whether the root is +// an archive file or is an extracted archive file, as they will +// work with the same filename/path inputs regardless of the +// presence of a top-level directory. +// +// EXPERIMENTAL: Subject to change or removal even after stable release. +func TopDirOpen(fsys fs.FS, name string) (fs.File, error) { + file, err := fsys.Open(name) + if err == nil { + return file, nil + } + return fsys.Open(pathWithoutTopDir(name)) +} + +// TopDirStat is like TopDirOpen but for Stat. +// +// EXPERIMENTAL: Subject to change or removal even after stable release. +func TopDirStat(fsys fs.FS, name string) (fs.FileInfo, error) { + info, err := fs.Stat(fsys, name) + if err == nil { + return info, nil + } + return fs.Stat(fsys, pathWithoutTopDir(name)) +} + +// TopDirReadDir is like TopDirOpen but for ReadDir. +// +// EXPERIMENTAL: Subject to change or removal even after stable release. +func TopDirReadDir(fsys fs.FS, name string) ([]fs.DirEntry, error) { + entries, err := fs.ReadDir(fsys, name) + if err == nil { + return entries, nil + } + return fs.ReadDir(fsys, pathWithoutTopDir(name)) +} + +func pathWithoutTopDir(fpath string) string { + slashIdx := strings.Index(fpath, "/") + if slashIdx < 0 { + return fpath + } + return fpath[slashIdx+1:] +} + +// dirFile implements the fs.ReadDirFile interface. +type dirFile struct { + info fs.FileInfo + entries []fs.DirEntry + entriesRead int // used for paging with ReadDir(n) +} + +func (dirFile) Read([]byte) (int, error) { return 0, errors.New("cannot read a directory file") } +func (df dirFile) Stat() (fs.FileInfo, error) { return df.info, nil } +func (dirFile) Close() error { return nil } + +// ReadDir implements [fs.ReadDirFile]. 
+func (df *dirFile) ReadDir(n int) ([]fs.DirEntry, error) { + if n <= 0 { + return df.entries, nil + } + if df.entriesRead >= len(df.entries) { + return nil, io.EOF + } + if df.entriesRead+n > len(df.entries) { + n = len(df.entries) - df.entriesRead + } + entries := df.entries[df.entriesRead : df.entriesRead+n] + df.entriesRead += n + return entries, nil +} + +// dirFileInfo is an implementation of fs.FileInfo that +// is only used for files that are directories. It always +// returns 0 size, directory bit set in the mode, and +// true for IsDir. It is often used as the FileInfo for +// dirFile values. +type dirFileInfo struct { + fs.FileInfo +} + +func (dirFileInfo) Size() int64 { return 0 } +func (info dirFileInfo) Mode() fs.FileMode { return info.FileInfo.Mode() | fs.ModeDir } +func (dirFileInfo) IsDir() bool { return true } + +// fileInArchive represents a file that is opened from within an archive. +// It implements fs.File. +type fileInArchive struct { + io.ReadCloser + info fs.FileInfo +} + +func (af fileInArchive) Stat() (fs.FileInfo, error) { return af.info, nil } + +// closeBoth closes both the file and an associated +// closer, such as a (de)compressor that wraps the +// reading/writing of the file. See issue #365. If a +// better solution is found, I'd probably prefer that. +type closeBoth struct { + fs.File + c io.Closer // usually the archive or the decompressor +} + +// Close closes both the file and the associated closer. It always calls +// Close() on both, but if multiple errors occur they are wrapped together. +func (dc closeBoth) Close() error { + var err error + if dc.File != nil { + if err2 := dc.File.Close(); err2 != nil { + err = fmt.Errorf("closing file: %w", err2) + } + } + if dc.c != nil { + if err2 := dc.c.Close(); err2 != nil { + if err == nil { + err = fmt.Errorf("closing closer: %w", err2) + } else { + err = fmt.Errorf("%w; additionally, closing closer: %w", err, err2) + } + } + } + return err +} + +// implicitDirEntry represents a directory that does +// not actually exist in the archive but is inferred +// from the paths of actual files in the archive. +type implicitDirEntry struct{ name string } + +func (e implicitDirEntry) Name() string { return e.name } +func (implicitDirEntry) IsDir() bool { return true } +func (implicitDirEntry) Type() fs.FileMode { return fs.ModeDir } +func (e implicitDirEntry) Info() (fs.FileInfo, error) { + return implicitDirInfo{e}, nil +} + +// implicitDirInfo is a fs.FileInfo for an implicit directory +// (implicitDirEntry) value. This is used when an archive may +// not contain actual entries for a directory, but we need to +// pretend it exists so its contents can be discovered and +// traversed. +type implicitDirInfo struct{ implicitDirEntry } + +func (d implicitDirInfo) Name() string { return d.name } +func (implicitDirInfo) Size() int64 { return 0 } +func (d implicitDirInfo) Mode() fs.FileMode { return d.Type() } +func (implicitDirInfo) ModTime() time.Time { return time.Time{} } +func (implicitDirInfo) Sys() any { return nil } + +// dotFileInfo is a fs.FileInfo that can be used to provide +// the true name instead of ".". 
+type dotFileInfo struct { + fs.FileInfo + name string +} + +func (d dotFileInfo) Name() string { return d.name } + +// Interface guards +var ( + _ fs.ReadDirFS = (*FileFS)(nil) + _ fs.StatFS = (*FileFS)(nil) + + _ fs.ReadDirFS = (*ArchiveFS)(nil) + _ fs.StatFS = (*ArchiveFS)(nil) + _ fs.SubFS = (*ArchiveFS)(nil) +) diff --git a/vendor/github.com/mholt/archives/gz.go b/vendor/github.com/mholt/archives/gz.go new file mode 100644 index 0000000000..adbf1ed4fc --- /dev/null +++ b/vendor/github.com/mholt/archives/gz.go @@ -0,0 +1,90 @@ +package archives + +import ( + "bytes" + "context" + "io" + "strings" + + "github.com/klauspost/compress/gzip" + "github.com/klauspost/pgzip" +) + +func init() { + RegisterFormat(Gz{}) +} + +// Gz facilitates gzip compression. +type Gz struct { + // Gzip compression level. See https://pkg.go.dev/compress/flate#pkg-constants + // for some predefined constants. If 0, DefaultCompression is assumed rather + // than no compression. + CompressionLevel int + + // DisableMultistream controls whether the reader supports multistream files. + // See https://pkg.go.dev/compress/gzip#example-Reader.Multistream + DisableMultistream bool + + // Use a fast parallel Gzip implementation. This is only + // effective for large streams (about 1 MB or greater). + Multithreaded bool +} + +func (Gz) Extension() string { return ".gz" } +func (Gz) MediaType() string { return "application/gzip" } + +func (gz Gz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), gz.Extension()) { + mr.ByName = true + } + + // match file header + buf, err := readAtMost(stream, len(gzHeader)) + if err != nil { + return mr, err + } + mr.ByStream = bytes.Equal(buf, gzHeader) + + return mr, nil +} + +func (gz Gz) OpenWriter(w io.Writer) (io.WriteCloser, error) { + // assume default compression level if 0, rather than no + // compression, since no compression on a gzipped file + // doesn't make any sense in our use cases + level := gz.CompressionLevel + if level == 0 { + level = gzip.DefaultCompression + } + + var wc io.WriteCloser + var err error + if gz.Multithreaded { + wc, err = pgzip.NewWriterLevel(w, level) + } else { + wc, err = gzip.NewWriterLevel(w, level) + } + return wc, err +} + +func (gz Gz) OpenReader(r io.Reader) (io.ReadCloser, error) { + if gz.Multithreaded { + gzR, err := pgzip.NewReader(r) + if gzR != nil && gz.DisableMultistream { + gzR.Multistream(false) + } + return gzR, err + } + + gzR, err := gzip.NewReader(r) + if gzR != nil && gz.DisableMultistream { + gzR.Multistream(false) + } + return gzR, err +} + +// magic number at the beginning of gzip files +var gzHeader = []byte{0x1f, 0x8b} diff --git a/vendor/github.com/mholt/archives/interfaces.go b/vendor/github.com/mholt/archives/interfaces.go new file mode 100644 index 0000000000..f917ca6004 --- /dev/null +++ b/vendor/github.com/mholt/archives/interfaces.go @@ -0,0 +1,116 @@ +package archives + +import ( + "context" + "io" +) + +// Format represents a way of getting data out of something else. +// A format usually represents compression or an archive (or both). +type Format interface { + // Extension returns the conventional file extension for this + // format. + Extension() string + + // MediaType returns the MIME type ("content type") of this + // format (see RFC 2046). 
+	MediaType() string
+
+	// Match reports whether the given name and/or stream is recognized.
+	// One of the arguments is optional: filename might be empty
+	// if working with an unnamed stream, or stream might be nil
+	// if only working with a file on disk; but both may also be
+	// specified. The filename should consist only of the base name,
+	// not path components, and is typically used for matching by
+	// file extension. However, matching by reading the stream is
+	// preferred as it is more accurate. Match reads only as many
+	// bytes as needed to determine a match.
+	Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error)
+}
+
+// Compression is a compression format with both compress and decompress methods.
+type Compression interface {
+	Format
+	Compressor
+	Decompressor
+}
+
+// Archival is an archival format that can both create/write archives
+// and extract from (read) them.
+type Archival interface {
+	Format
+	Archiver
+	Extractor
+}
+
+// Extraction is an archival format that extracts from (reads) archives.
+type Extraction interface {
+	Format
+	Extractor
+}
+
+// Compressor can compress data by wrapping a writer.
+type Compressor interface {
+	// OpenWriter wraps w with a new writer that compresses what is written.
+	// The writer must be closed when writing is finished.
+	OpenWriter(w io.Writer) (io.WriteCloser, error)
+}
+
+// Decompressor can decompress data by wrapping a reader.
+type Decompressor interface {
+	// OpenReader wraps r with a new reader that decompresses what is read.
+	// The reader must be closed when reading is finished.
+	OpenReader(r io.Reader) (io.ReadCloser, error)
+}
+
+// Archiver can create a new archive.
+type Archiver interface {
+	// Archive writes an archive file to output with the given files.
+	//
+	// Context cancellation must be honored.
+	Archive(ctx context.Context, output io.Writer, files []FileInfo) error
+}
+
+// ArchiveAsyncJob contains a File to be archived and a channel that
+// the result of the archiving should be returned on.
+// EXPERIMENTAL: Subject to change or removal.
+type ArchiveAsyncJob struct {
+	File   FileInfo
+	Result chan<- error
+}
+
+// ArchiverAsync is an Archiver that can also create archives
+// asynchronously by pumping files into a channel as they are
+// discovered.
+// EXPERIMENTAL: Subject to change or removal.
+type ArchiverAsync interface {
+	Archiver
+
+	// Use ArchiveAsync if you can't pre-assemble a list of all
+	// the files for the archive. Close the jobs channel after
+	// all the files have been sent.
+	//
+	// This won't return until the channel is closed.
+	ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error
+}
+
+// Extractor can extract files from an archive.
+type Extractor interface {
+	// Extract walks entries in the archive and calls handleFile for each
+	// entry in the archive.
+	//
+	// Any files opened in the FileHandler should be closed when it returns,
+	// as there is no guarantee the files can be read outside the handler
+	// or after the walk has proceeded to the next file.
+	//
+	// Context cancellation must be honored.
+	Extract(ctx context.Context, archive io.Reader, handleFile FileHandler) error
+}
+
+// Inserter can insert files into an existing archive.
+// EXPERIMENTAL: Subject to change.
+type Inserter interface {
+	// Insert inserts the files into archive.
+	//
+	// Context cancellation must be honored.
+ Insert(ctx context.Context, archive io.ReadWriteSeeker, files []FileInfo) error +} diff --git a/vendor/github.com/mholt/archives/lz4.go b/vendor/github.com/mholt/archives/lz4.go new file mode 100644 index 0000000000..39fce3c680 --- /dev/null +++ b/vendor/github.com/mholt/archives/lz4.go @@ -0,0 +1,57 @@ +package archives + +import ( + "bytes" + "context" + "io" + "strings" + + "github.com/pierrec/lz4/v4" +) + +func init() { + RegisterFormat(Lz4{}) +} + +// Lz4 facilitates LZ4 compression. +type Lz4 struct { + CompressionLevel int +} + +func (Lz4) Extension() string { return ".lz4" } +func (Lz4) MediaType() string { return "application/x-lz4" } + +func (lz Lz4) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), lz.Extension()) { + mr.ByName = true + } + + // match file header + buf, err := readAtMost(stream, len(lz4Header)) + if err != nil { + return mr, err + } + mr.ByStream = bytes.Equal(buf, lz4Header) + + return mr, nil +} + +func (lz Lz4) OpenWriter(w io.Writer) (io.WriteCloser, error) { + lzw := lz4.NewWriter(w) + options := []lz4.Option{ + lz4.CompressionLevelOption(lz4.CompressionLevel(lz.CompressionLevel)), + } + if err := lzw.Apply(options...); err != nil { + return nil, err + } + return lzw, nil +} + +func (Lz4) OpenReader(r io.Reader) (io.ReadCloser, error) { + return io.NopCloser(lz4.NewReader(r)), nil +} + +var lz4Header = []byte{0x04, 0x22, 0x4d, 0x18} diff --git a/vendor/github.com/mholt/archives/lzip.go b/vendor/github.com/mholt/archives/lzip.go new file mode 100644 index 0000000000..fa7fdc1b08 --- /dev/null +++ b/vendor/github.com/mholt/archives/lzip.go @@ -0,0 +1,55 @@ +package archives + +import ( + "bytes" + "context" + "io" + "path/filepath" + "strings" + + "github.com/sorairolake/lzip-go" +) + +func init() { + RegisterFormat(Lzip{}) +} + +// Lzip facilitates lzip compression. +type Lzip struct{} + +func (Lzip) Extension() string { return ".lz" } +func (Lzip) MediaType() string { return "application/x-lzip" } + +func (lz Lzip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if filepath.Ext(strings.ToLower(filename)) == lz.Extension() { + mr.ByName = true + } + + // match file header + buf, err := readAtMost(stream, len(lzipHeader)) + if err != nil { + return mr, err + } + mr.ByStream = bytes.Equal(buf, lzipHeader) + + return mr, nil +} + +func (Lzip) OpenWriter(w io.Writer) (io.WriteCloser, error) { + return lzip.NewWriter(w), nil +} + +func (Lzip) OpenReader(r io.Reader) (io.ReadCloser, error) { + lzr, err := lzip.NewReader(r) + if err != nil { + return nil, err + } + return io.NopCloser(lzr), err +} + +// magic number at the beginning of lzip files +// https://datatracker.ietf.org/doc/html/draft-diaz-lzip-09#section-2 +var lzipHeader = []byte("LZIP") diff --git a/vendor/github.com/mholt/archives/minlz.go b/vendor/github.com/mholt/archives/minlz.go new file mode 100644 index 0000000000..72aede0ec9 --- /dev/null +++ b/vendor/github.com/mholt/archives/minlz.go @@ -0,0 +1,53 @@ +package archives + +import ( + "bytes" + "context" + "io" + "path/filepath" + "strings" + + "github.com/minio/minlz" +) + +func init() { + RegisterFormat(MinLZ{}) +} + +// MinLZ facilitates MinLZ compression. 
See
+// https://github.com/minio/minlz/blob/main/SPEC.md
+// and
+// https://blog.min.io/minlz-compression-algorithm/.
+type MinLZ struct{}
+
+func (MinLZ) Extension() string { return ".mz" }
+func (MinLZ) MediaType() string { return "application/x-minlz-compressed" }
+
+func (mz MinLZ) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
+	var mr MatchResult
+
+	// match filename
+	if filepath.Ext(strings.ToLower(filename)) == ".mz" {
+		mr.ByName = true
+	}
+
+	// match file header
+	buf, err := readAtMost(stream, len(mzHeader))
+	if err != nil {
+		return mr, err
+	}
+	mr.ByStream = bytes.Equal(buf, mzHeader)
+
+	return mr, nil
+}
+
+func (MinLZ) OpenWriter(w io.Writer) (io.WriteCloser, error) {
+	return minlz.NewWriter(w), nil
+}
+
+func (MinLZ) OpenReader(r io.Reader) (io.ReadCloser, error) {
+	mr := minlz.NewReader(r)
+	return io.NopCloser(mr), nil
+}
+
+var mzHeader = []byte("\xff\x06\x00\x00MinLz")
diff --git a/vendor/github.com/mholt/archives/rar.go b/vendor/github.com/mholt/archives/rar.go
new file mode 100644
index 0000000000..388ecab18a
--- /dev/null
+++ b/vendor/github.com/mholt/archives/rar.go
@@ -0,0 +1,179 @@
+package archives
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"io/fs"
+	"log"
+	"os"
+	"path"
+	"strings"
+	"time"
+
+	"github.com/nwaples/rardecode/v2"
+)
+
+func init() {
+	RegisterFormat(Rar{})
+}
+
+type rarReader interface {
+	Next() (*rardecode.FileHeader, error)
+	io.Reader
+	io.WriterTo
+}
+
+type Rar struct {
+	// If true, errors encountered during reading or writing
+	// a file within an archive will be logged and the
+	// operation will continue on remaining files.
+	ContinueOnError bool
+
+	// Password to open archives.
+	Password string
+
+	// Name for a multi-volume archive. When Name is specified,
+	// the named file is extracted (rather than any io.Reader that
+	// may be passed to Extract). If the archive is a multi-volume
+	// archive, this name will also be used by the decoder to derive
+	// the filename of the next volume in the volume set.
+	Name string
+
+	// FS is an fs.FS exposing the files of the archive. Unless Name is
+	// also specified, this does nothing. When Name is also specified,
+	// FS defines the fs.FS from which the archive will be opened, and
+	// in the case of a multi-volume archive, from where each subsequent
+	// volume of the volume set will be loaded.
+	//
+	// Typically this should be a DirFS pointing at the directory containing
+	// the volumes of the archive.
+	FS fs.FS
+}
+
+func (Rar) Extension() string { return ".rar" }
+func (Rar) MediaType() string { return "application/vnd.rar" }
+
+func (r Rar) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
+	var mr MatchResult
+
+	// match filename
+	if strings.Contains(strings.ToLower(filename), r.Extension()) {
+		mr.ByName = true
+	}
+
+	// match file header (there are two versions; allocate buffer for larger one)
+	buf, err := readAtMost(stream, len(rarHeaderV5_0))
+	if err != nil {
+		return mr, err
+	}
+
+	matchedV1_5 := len(buf) >= len(rarHeaderV1_5) &&
+		bytes.Equal(rarHeaderV1_5, buf[:len(rarHeaderV1_5)])
+	matchedV5_0 := len(buf) >= len(rarHeaderV5_0) &&
+		bytes.Equal(rarHeaderV5_0, buf[:len(rarHeaderV5_0)])
+
+	mr.ByStream = matchedV1_5 || matchedV5_0
+
+	return mr, nil
+}
+
+// Archive is not implemented for RAR because it is patent-encumbered.
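+//
+// As a rough extraction sketch (the volume name and password are
+// hypothetical, a ctx is assumed in scope, and error handling is elided),
+// a multi-volume, password-protected archive could be read with:
+//
+//	format := Rar{Password: "secret", Name: "backup.part1.rar", FS: DirFS("/backups")}
+//	err := format.Extract(ctx, nil, func(ctx context.Context, f FileInfo) error {
+//		fmt.Println(f.NameInArchive)
+//		return nil
+//	})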
+ +func (r Rar) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { + var options []rardecode.Option + if r.Password != "" { + options = append(options, rardecode.Password(r.Password)) + } + + if r.FS != nil { + options = append(options, rardecode.FileSystem(r.FS)) + } + + var ( + rr rarReader + err error + ) + + // If a name has been provided, then the sourceArchive stream is ignored + // and the archive is opened directly via the filesystem (or provided FS). + if r.Name != "" { + var or *rardecode.ReadCloser + if or, err = rardecode.OpenReader(r.Name, options...); err == nil { + rr = or + defer or.Close() + } + } else { + rr, err = rardecode.NewReader(sourceArchive, options...) + } + if err != nil { + return err + } + + // important to initialize to non-nil, empty value due to how fileIsIncluded works + skipDirs := skipList{} + + for { + if err := ctx.Err(); err != nil { + return err // honor context cancellation + } + + hdr, err := rr.Next() + if err == io.EOF { + break + } + if err != nil { + if r.ContinueOnError { + log.Printf("[ERROR] Advancing to next file in rar archive: %v", err) + continue + } + return err + } + if fileIsIncluded(skipDirs, hdr.Name) { + continue + } + + info := rarFileInfo{hdr} + file := FileInfo{ + FileInfo: info, + Header: hdr, + NameInArchive: hdr.Name, + Open: func() (fs.File, error) { + return fileInArchive{io.NopCloser(rr), info}, nil + }, + } + + err = handleFile(ctx, file) + if errors.Is(err, fs.SkipAll) { + break + } else if errors.Is(err, fs.SkipDir) && file.IsDir() { + skipDirs.add(hdr.Name) + } else if err != nil { + return fmt.Errorf("handling file: %s: %w", hdr.Name, err) + } + } + + return nil +} + +// rarFileInfo satisfies the fs.FileInfo interface for RAR entries. +type rarFileInfo struct { + fh *rardecode.FileHeader +} + +func (rfi rarFileInfo) Name() string { return path.Base(rfi.fh.Name) } +func (rfi rarFileInfo) Size() int64 { return rfi.fh.UnPackedSize } +func (rfi rarFileInfo) Mode() os.FileMode { return rfi.fh.Mode() } +func (rfi rarFileInfo) ModTime() time.Time { return rfi.fh.ModificationTime } +func (rfi rarFileInfo) IsDir() bool { return rfi.fh.IsDir } +func (rfi rarFileInfo) Sys() any { return nil } + +var ( + rarHeaderV1_5 = []byte("Rar!\x1a\x07\x00") // v1.5 + rarHeaderV5_0 = []byte("Rar!\x1a\x07\x01\x00") // v5.0 +) + +// Interface guard +var _ Extractor = Rar{} diff --git a/vendor/github.com/mholt/archives/sz.go b/vendor/github.com/mholt/archives/sz.go new file mode 100644 index 0000000000..bb23f2109f --- /dev/null +++ b/vendor/github.com/mholt/archives/sz.go @@ -0,0 +1,133 @@ +package archives + +import ( + "bytes" + "context" + "io" + "strings" + + "github.com/klauspost/compress/s2" +) + +func init() { + RegisterFormat(Sz{}) +} + +// Sz facilitates Snappy compression. It uses S2 +// for reading and writing, but by default will +// write Snappy-compatible data. +type Sz struct { + // Configurable S2 extension. + S2 S2 +} + +// S2 is an extension of Snappy that can read Snappy +// streams and write Snappy-compatible streams, but +// can also be configured to write Snappy-incompatible +// streams for greater gains. See +// https://pkg.go.dev/github.com/klauspost/compress/s2 +// for details and the documentation for each option. 
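+//
+// For illustration (values are arbitrary; dst is any io.Writer), a writer
+// tuned for better compression in Snappy-incompatible S2 mode might be
+// configured as:
+//
+//	sz := Sz{S2: S2{Compression: S2LevelBetter, SnappyIncompatible: true}}
+//	wc, err := sz.OpenWriter(dst)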
+type S2 struct { + // reader options + MaxBlockSize int + AllocBlock int + IgnoreStreamIdentifier bool + IgnoreCRC bool + + // writer options + AddIndex bool + Compression S2Level + BlockSize int + Concurrency int + FlushOnWrite bool + Padding int + SnappyIncompatible bool +} + +func (Sz) Extension() string { return ".sz" } +func (Sz) MediaType() string { return "application/x-snappy-framed" } + +func (sz Sz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), sz.Extension()) || + strings.Contains(strings.ToLower(filename), ".s2") { + mr.ByName = true + } + + // match file header + buf, err := readAtMost(stream, len(snappyHeader)) + if err != nil { + return mr, err + } + mr.ByStream = bytes.Equal(buf, snappyHeader) + + return mr, nil +} + +func (sz Sz) OpenWriter(w io.Writer) (io.WriteCloser, error) { + var opts []s2.WriterOption + if sz.S2.AddIndex { + opts = append(opts, s2.WriterAddIndex()) + } + switch sz.S2.Compression { + case S2LevelNone: + opts = append(opts, s2.WriterUncompressed()) + case S2LevelBetter: + opts = append(opts, s2.WriterBetterCompression()) + case S2LevelBest: + opts = append(opts, s2.WriterBestCompression()) + } + if sz.S2.BlockSize != 0 { + opts = append(opts, s2.WriterBlockSize(sz.S2.BlockSize)) + } + if sz.S2.Concurrency != 0 { + opts = append(opts, s2.WriterConcurrency(sz.S2.Concurrency)) + } + if sz.S2.FlushOnWrite { + opts = append(opts, s2.WriterFlushOnWrite()) + } + if sz.S2.Padding != 0 { + opts = append(opts, s2.WriterPadding(sz.S2.Padding)) + } + if !sz.S2.SnappyIncompatible { + // this option is inverted because by default we should + // probably write Snappy-compatible streams + opts = append(opts, s2.WriterSnappyCompat()) + } + return s2.NewWriter(w, opts...), nil +} + +func (sz Sz) OpenReader(r io.Reader) (io.ReadCloser, error) { + var opts []s2.ReaderOption + if sz.S2.AllocBlock != 0 { + opts = append(opts, s2.ReaderAllocBlock(sz.S2.AllocBlock)) + } + if sz.S2.IgnoreCRC { + opts = append(opts, s2.ReaderIgnoreCRC()) + } + if sz.S2.IgnoreStreamIdentifier { + opts = append(opts, s2.ReaderIgnoreStreamIdentifier()) + } + if sz.S2.MaxBlockSize != 0 { + opts = append(opts, s2.ReaderMaxBlockSize(sz.S2.MaxBlockSize)) + } + return io.NopCloser(s2.NewReader(r, opts...)), nil +} + +// Compression level for S2 (Snappy/Sz extension). +// EXPERIMENTAL: May be changed or removed without a major version bump. +type S2Level int + +// Compression levels for S2. +// EXPERIMENTAL: May be changed or removed without a major version bump. 
+const ( + S2LevelNone S2Level = 0 + S2LevelFast S2Level = 1 + S2LevelBetter S2Level = 2 + S2LevelBest S2Level = 3 +) + +// https://github.com/google/snappy/blob/master/framing_format.txt - contains "sNaPpY" +var snappyHeader = []byte{0xff, 0x06, 0x00, 0x00, 0x73, 0x4e, 0x61, 0x50, 0x70, 0x59} diff --git a/vendor/github.com/mholt/archives/tar.go b/vendor/github.com/mholt/archives/tar.go new file mode 100644 index 0000000000..96c495a992 --- /dev/null +++ b/vendor/github.com/mholt/archives/tar.go @@ -0,0 +1,278 @@ +package archives + +import ( + "archive/tar" + "context" + "errors" + "fmt" + "io" + "io/fs" + "log" + "strings" +) + +func init() { + RegisterFormat(Tar{}) +} + +type Tar struct { + // If true, use GNU header format + FormatGNU bool + + // If true, preserve only numeric user and group id + NumericUIDGID bool + + // If true, errors encountered during reading or writing + // a file within an archive will be logged and the + // operation will continue on remaining files. + ContinueOnError bool + + // User ID of the file owner + Uid int + + // Group ID of the file owner + Gid int + + // Username of the file owner + Uname string + + // Group name of the file owner + Gname string +} + +func (Tar) Extension() string { return ".tar" } +func (Tar) MediaType() string { return "application/x-tar" } + +func (t Tar) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), t.Extension()) { + mr.ByName = true + } + + // match file header + if stream != nil { + r := tar.NewReader(stream) + _, err := r.Next() + mr.ByStream = err == nil + } + + return mr, nil +} + +func (t Tar) Archive(ctx context.Context, output io.Writer, files []FileInfo) error { + tw := tar.NewWriter(output) + defer tw.Close() + + for _, file := range files { + if err := t.writeFileToArchive(ctx, tw, file); err != nil { + if t.ContinueOnError && ctx.Err() == nil { // context errors should always abort + log.Printf("[ERROR] %v", err) + continue + } + return err + } + } + + return nil +} + +func (t Tar) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error { + tw := tar.NewWriter(output) + defer tw.Close() + + for job := range jobs { + job.Result <- t.writeFileToArchive(ctx, tw, job.File) + } + + return nil +} + +func (t Tar) writeFileToArchive(ctx context.Context, tw *tar.Writer, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err // honor context cancellation + } + + hdr, err := tar.FileInfoHeader(file, file.LinkTarget) + if err != nil { + return fmt.Errorf("file %s: creating header: %w", file.NameInArchive, err) + } + hdr.Name = file.NameInArchive // complete path, since FileInfoHeader() only has base name + if hdr.Name == "" { + hdr.Name = file.Name() // assume base name of file I guess + } + if t.FormatGNU { + hdr.Format = tar.FormatGNU + } + if t.NumericUIDGID { + hdr.Uname = "" + hdr.Gname = "" + } + if t.Uid != 0 { + hdr.Uid = t.Uid + } + if t.Gid != 0 { + hdr.Gid = t.Gid + } + if t.Uname != "" { + hdr.Uname = t.Uname + } + if t.Gname != "" { + hdr.Gname = t.Gname + } + + if err := tw.WriteHeader(hdr); err != nil { + return fmt.Errorf("file %s: writing header: %w", file.NameInArchive, err) + } + + // only proceed to write a file body if there is actually a body + // (for example, directories and links don't have a body) + if hdr.Typeflag != tar.TypeReg { + return nil + } + + if err := openAndCopyFile(file, tw); err != 
nil { + return fmt.Errorf("file %s: writing data: %w", file.NameInArchive, err) + } + + return nil +} + +func (t Tar) Insert(ctx context.Context, into io.ReadWriteSeeker, files []FileInfo) error { + // Tar files may end with some, none, or a lot of zero-byte padding. The spec says + // it should end with two 512-byte trailer records consisting solely of null/0 + // bytes: https://www.gnu.org/software/tar/manual/html_node/Standard.html. However, + // in my experiments using the `tar` command, I've found that is not the case, + // and Colin Percival (author of tarsnap) confirmed this: + // - https://twitter.com/cperciva/status/1476774314623913987 + // - https://twitter.com/cperciva/status/1476776999758663680 + // So while this solution on Stack Overflow makes sense if you control the + // writer: https://stackoverflow.com/a/18330903/1048862 - and I did get it + // to work in that case -- it is not a general solution. Seems that the only + // reliable thing to do is scan the entire archive to find the last file, + // read its size, then use that to compute the end of content and thus the + // true length of end-of-archive padding. This is slightly more complex than + // just adding the size of the last file to the current stream/seek position, + // because we have to align to 512-byte blocks precisely. I don't actually + // fully know why this works, but in my testing on a few different files it + // did work, whereas other solutions only worked on 1 specific file. *shrug* + // + // Another option is to scan the file for the last contiguous series of 0s, + // without interpreting the tar format at all, and to find the nearest + // blocksize-offset and start writing there. Problem is that you wouldn't + // know if you just overwrote some of the last file if it ends with all 0s. + // Sigh. 
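+	//
+	// In short: scan to EOF, remember where the last header ended and how
+	// large the last file claims to be, then round that position up to the
+	// next 512-byte boundary and start writing there.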
+ var lastFileSize, lastStreamPos int64 + tr := tar.NewReader(into) + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + lastStreamPos, err = into.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + lastFileSize = hdr.Size + } + + // we can now compute the precise location to write the new file to (I think) + const blockSize = 512 // (as of Go 1.17, this is also a hard-coded const in the archive/tar package) + newOffset := lastStreamPos + lastFileSize + newOffset += blockSize - (newOffset % blockSize) // shift to next-nearest block boundary + _, err := into.Seek(newOffset, io.SeekStart) + if err != nil { + return err + } + + tw := tar.NewWriter(into) + defer tw.Close() + + for i, file := range files { + if err := ctx.Err(); err != nil { + return err // honor context cancellation + } + err = t.writeFileToArchive(ctx, tw, file) + if err != nil { + if t.ContinueOnError && ctx.Err() == nil { + log.Printf("[ERROR] appending file %d into archive: %s: %v", i, file.Name(), err) + continue + } + return fmt.Errorf("appending file %d into archive: %s: %w", i, file.Name(), err) + } + } + + return nil +} + +func (t Tar) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { + tr := tar.NewReader(sourceArchive) + + // important to initialize to non-nil, empty value due to how fileIsIncluded works + skipDirs := skipList{} + + for { + if err := ctx.Err(); err != nil { + return err // honor context cancellation + } + + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + if t.ContinueOnError && ctx.Err() == nil { + log.Printf("[ERROR] Advancing to next file in tar archive: %v", err) + continue + } + return err + } + if fileIsIncluded(skipDirs, hdr.Name) { + continue + } + if hdr.Typeflag == tar.TypeXGlobalHeader { + // ignore the pax global header from git-generated tarballs + continue + } + + info := hdr.FileInfo() + file := FileInfo{ + FileInfo: info, + Header: hdr, + NameInArchive: hdr.Name, + LinkTarget: hdr.Linkname, + Open: func() (fs.File, error) { + return fileInArchive{io.NopCloser(tr), info}, nil + }, + } + + err = handleFile(ctx, file) + if errors.Is(err, fs.SkipAll) { + // At first, I wasn't sure if fs.SkipAll implied that the rest of the entries + // should still be iterated and just "skipped" (i.e. no-ops) or if the walk + // should stop; both have the same net effect, one is just less efficient... + // apparently the name of fs.StopWalk was the preferred name, but it still + // became fs.SkipAll because of semantics with documentation; see + // https://github.com/golang/go/issues/47209 -- anyway, the walk should stop. + break + } else if errors.Is(err, fs.SkipDir) && file.IsDir() { + skipDirs.add(hdr.Name) + } else if err != nil { + return fmt.Errorf("handling file: %s: %w", hdr.Name, err) + } + } + + return nil +} + +// Interface guards +var ( + _ Archiver = (*Tar)(nil) + _ ArchiverAsync = (*Tar)(nil) + _ Extractor = (*Tar)(nil) + _ Inserter = (*Tar)(nil) +) diff --git a/vendor/github.com/mholt/archives/xz.go b/vendor/github.com/mholt/archives/xz.go new file mode 100644 index 0000000000..e213bae788 --- /dev/null +++ b/vendor/github.com/mholt/archives/xz.go @@ -0,0 +1,55 @@ +package archives + +import ( + "bytes" + "context" + "io" + "strings" + + fastxz "github.com/mikelolasagasti/xz" + "github.com/ulikunitz/xz" +) + +func init() { + RegisterFormat(Xz{}) +} + +// Xz facilitates xz compression. 
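+// OpenWriter compresses with github.com/ulikunitz/xz, while OpenReader
+// decompresses with the native decoder from github.com/mikelolasagasti/xz.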
+type Xz struct{} + +func (Xz) Extension() string { return ".xz" } +func (Xz) MediaType() string { return "application/x-xz" } + +func (x Xz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), x.Extension()) { + mr.ByName = true + } + + // match file header + buf, err := readAtMost(stream, len(xzHeader)) + if err != nil { + return mr, err + } + mr.ByStream = bytes.Equal(buf, xzHeader) + + return mr, nil +} + +func (Xz) OpenWriter(w io.Writer) (io.WriteCloser, error) { + return xz.NewWriter(w) +} + +func (Xz) OpenReader(r io.Reader) (io.ReadCloser, error) { + xr, err := fastxz.NewReader(r, 0) + if err != nil { + return nil, err + } + return io.NopCloser(xr), err +} + +// magic number at the beginning of xz files; see section 2.1.1.1 +// of https://tukaani.org/xz/xz-file-format.txt +var xzHeader = []byte{0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00} diff --git a/vendor/github.com/mholt/archives/zip.go b/vendor/github.com/mholt/archives/zip.go new file mode 100644 index 0000000000..be0b7bc88f --- /dev/null +++ b/vendor/github.com/mholt/archives/zip.go @@ -0,0 +1,436 @@ +package archives + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "io/fs" + "log" + "os" + "path" + "strings" + + szip "github.com/STARRY-S/zip" + "golang.org/x/text/encoding" + + "github.com/dsnet/compress/bzip2" + "github.com/klauspost/compress/zip" + "github.com/klauspost/compress/zstd" + "github.com/ulikunitz/xz" +) + +func init() { + RegisterFormat(Zip{}) + + // TODO: What about custom flate levels too + zip.RegisterCompressor(ZipMethodBzip2, func(out io.Writer) (io.WriteCloser, error) { + return bzip2.NewWriter(out, &bzip2.WriterConfig{ /*TODO: Level: z.CompressionLevel*/ }) + }) + zip.RegisterCompressor(ZipMethodZstd, func(out io.Writer) (io.WriteCloser, error) { + return zstd.NewWriter(out) + }) + zip.RegisterCompressor(ZipMethodXz, func(out io.Writer) (io.WriteCloser, error) { + return xz.NewWriter(out) + }) + + zip.RegisterDecompressor(ZipMethodBzip2, func(r io.Reader) io.ReadCloser { + bz2r, err := bzip2.NewReader(r, nil) + if err != nil { + return nil + } + return bz2r + }) + zip.RegisterDecompressor(ZipMethodZstd, func(r io.Reader) io.ReadCloser { + zr, err := zstd.NewReader(r) + if err != nil { + return nil + } + return zr.IOReadCloser() + }) + zip.RegisterDecompressor(ZipMethodXz, func(r io.Reader) io.ReadCloser { + xr, err := xz.NewReader(r) + if err != nil { + return nil + } + return io.NopCloser(xr) + }) +} + +type Zip struct { + // Only compress files which are not already in a + // compressed format (determined simply by examining + // file extension). + SelectiveCompression bool + + // The method or algorithm for compressing stored files. + Compression uint16 + + // If true, errors encountered during reading or writing + // a file within an archive will be logged and the + // operation will continue on remaining files. + ContinueOnError bool + + // For files in zip archives that do not have UTF-8 + // encoded filenames and comments, specify the character + // encoding here. 
+ TextEncoding encoding.Encoding +} + +func (Zip) Extension() string { return ".zip" } +func (Zip) MediaType() string { return "application/zip" } + +func (z Zip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), z.Extension()) { + mr.ByName = true + } + + // match file header + for _, hdr := range zipHeaders { + buf, err := readAtMost(stream, len(hdr)) + if err != nil { + return mr, err + } + if bytes.Equal(buf, hdr) { + mr.ByStream = true + break + } + } + + return mr, nil +} + +func (z Zip) Archive(ctx context.Context, output io.Writer, files []FileInfo) error { + zw := zip.NewWriter(output) + defer zw.Close() + + for i, file := range files { + if err := z.archiveOneFile(ctx, zw, i, file); err != nil { + return err + } + } + + return nil +} + +func (z Zip) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error { + zw := zip.NewWriter(output) + defer zw.Close() + + var i int + for job := range jobs { + job.Result <- z.archiveOneFile(ctx, zw, i, job.File) + i++ + } + + return nil +} + +func (z Zip) archiveOneFile(ctx context.Context, zw *zip.Writer, idx int, file FileInfo) error { + if err := ctx.Err(); err != nil { + return err // honor context cancellation + } + + hdr, err := zip.FileInfoHeader(file) + if err != nil { + return fmt.Errorf("getting info for file %d: %s: %w", idx, file.Name(), err) + } + hdr.Name = file.NameInArchive // complete path, since FileInfoHeader() only has base name + if hdr.Name == "" { + hdr.Name = file.Name() // assume base name of file I guess + } + + // customize header based on file properties + if file.IsDir() { + if !strings.HasSuffix(hdr.Name, "/") { + hdr.Name += "/" // required + } + hdr.Method = zip.Store + } else if z.SelectiveCompression { + // only enable compression on compressable files + ext := strings.ToLower(path.Ext(hdr.Name)) + if _, ok := compressedFormats[ext]; ok { + hdr.Method = zip.Store + } else { + hdr.Method = z.Compression + } + } else { + hdr.Method = z.Compression + } + + w, err := zw.CreateHeader(hdr) + if err != nil { + return fmt.Errorf("creating header for file %d: %s: %w", idx, file.Name(), err) + } + + // file won't be considered a symlink if FollowSymlinks in FilesFromDisk is true + if isSymlink(file) { + _, err := w.Write([]byte(file.LinkTarget)) + if err != nil { + return fmt.Errorf("writing link target for file %d: %s: %w", idx, file.Name(), err) + } + return nil + } + + // directories have no file body + if file.IsDir() { + return nil + } + + if err := openAndCopyFile(file, w); err != nil { + return fmt.Errorf("writing file %d: %s: %w", idx, file.Name(), err) + } + + return nil +} + +// Extract extracts files from z, implementing the Extractor interface. Uniquely, however, +// sourceArchive must be an io.ReaderAt and io.Seeker, which are oddly disjoint interfaces +// from io.Reader which is what the method signature requires. We chose this signature for +// the interface because we figure you can Read() from anything you can ReadAt() or Seek() +// with. Due to the nature of the zip archive format, if sourceArchive is not an io.Seeker +// and io.ReaderAt, an error is returned. 
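+//
+// In practice, an *os.File or a bytes.Reader satisfies io.Reader, io.ReaderAt,
+// and io.Seeker all at once, so either works as sourceArchive here.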
+func (z Zip) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { + sra, ok := sourceArchive.(seekReaderAt) + if !ok { + return fmt.Errorf("input type must be an io.ReaderAt and io.Seeker because of zip format constraints") + } + + size, err := streamSizeBySeeking(sra) + if err != nil { + return fmt.Errorf("determining stream size: %w", err) + } + + zr, err := zip.NewReader(sra, size) + if err != nil { + return err + } + + // important to initialize to non-nil, empty value due to how fileIsIncluded works + skipDirs := skipList{} + + for i, f := range zr.File { + if err := ctx.Err(); err != nil { + return err // honor context cancellation + } + + // ensure filename and comment are UTF-8 encoded (issue #147 and PR #305) + z.decodeText(&f.FileHeader) + + if fileIsIncluded(skipDirs, f.Name) { + continue + } + + info := f.FileInfo() + linkTarget, err := z.getLinkTarget(f) + if err != nil { + return fmt.Errorf("getting link target for file %d: %s: %w", i, f.Name, err) + } + + file := FileInfo{ + FileInfo: info, + Header: f.FileHeader, + NameInArchive: f.Name, + LinkTarget: linkTarget, + Open: func() (fs.File, error) { + openedFile, err := f.Open() + if err != nil { + return nil, err + } + return fileInArchive{openedFile, info}, nil + }, + } + + err = handleFile(ctx, file) + if errors.Is(err, fs.SkipAll) { + break + } else if errors.Is(err, fs.SkipDir) && file.IsDir() { + skipDirs.add(f.Name) + } else if err != nil { + if z.ContinueOnError { + log.Printf("[ERROR] %s: %v", f.Name, err) + continue + } + return fmt.Errorf("handling file %d: %s: %w", i, f.Name, err) + } + } + + return nil +} + +// decodeText decodes the name and comment fields from hdr into UTF-8. +// It is a no-op if the text is already UTF-8 encoded or if z.TextEncoding +// is not specified. +func (z Zip) decodeText(hdr *zip.FileHeader) { + if hdr.NonUTF8 && z.TextEncoding != nil { + dec := z.TextEncoding.NewDecoder() + filename, err := dec.String(hdr.Name) + if err == nil { + hdr.Name = filename + } + if hdr.Comment != "" { + comment, err := dec.String(hdr.Comment) + if err == nil { + hdr.Comment = comment + } + } + } +} + +func (z Zip) getLinkTarget(f *zip.File) (string, error) { + info := f.FileInfo() + // Exit early if not a symlink + if info.Mode()&os.ModeSymlink == 0 { + return "", nil + } + + // Open the file and read the link target + file, err := f.Open() + if err != nil { + return "", err + } + defer file.Close() + + const maxLinkTargetSize = 32768 + linkTargetBytes, err := io.ReadAll(io.LimitReader(file, maxLinkTargetSize)) + if err != nil { + return "", err + } + + if len(linkTargetBytes) == maxLinkTargetSize { + return "", fmt.Errorf("link target is too large: %d bytes", len(linkTargetBytes)) + } + + return string(linkTargetBytes), nil +} + +// Insert appends the listed files into the provided Zip archive stream. +// If the filename already exists in the archive, it will be replaced. 
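+// Insert expects into to already contain a valid zip archive, since it is
+// opened with github.com/STARRY-S/zip's Updater before new entries are appended.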
+func (z Zip) Insert(ctx context.Context, into io.ReadWriteSeeker, files []FileInfo) error {
+	// following very simple example at https://github.com/STARRY-S/zip?tab=readme-ov-file#usage
+	zu, err := szip.NewUpdater(into)
+	if err != nil {
+		return err
+	}
+	defer zu.Close()
+
+	for idx, file := range files {
+		if err := ctx.Err(); err != nil {
+			return err // honor context cancellation
+		}
+
+		hdr, err := szip.FileInfoHeader(file)
+		if err != nil {
+			return fmt.Errorf("getting info for file %d: %s: %w", idx, file.NameInArchive, err)
+		}
+		hdr.Name = file.NameInArchive // complete path, since FileInfoHeader() only has base name
+		if hdr.Name == "" {
+			hdr.Name = file.Name() // assume base name of file I guess
+		}
+
+		// customize header based on file properties
+		if file.IsDir() {
+			if !strings.HasSuffix(hdr.Name, "/") {
+				hdr.Name += "/" // required
+			}
+			hdr.Method = zip.Store
+		} else if z.SelectiveCompression {
+			// only enable compression on compressable files
+			ext := strings.ToLower(path.Ext(hdr.Name))
+			if _, ok := compressedFormats[ext]; ok {
+				hdr.Method = zip.Store
+			} else {
+				hdr.Method = z.Compression
+			}
+		}
+
+		w, err := zu.AppendHeader(hdr, szip.APPEND_MODE_OVERWRITE)
+		if err != nil {
+			return fmt.Errorf("inserting file header: %d: %s: %w", idx, file.Name(), err)
+		}
+
+		// directories have no file body; skip to the next file
+		if file.IsDir() {
+			continue
+		}
+		if err := openAndCopyFile(file, w); err != nil {
+			if z.ContinueOnError && ctx.Err() == nil {
+				log.Printf("[ERROR] appending file %d into archive: %s: %v", idx, file.Name(), err)
+				continue
+			}
+			return fmt.Errorf("copying inserted file %d: %s: %w", idx, file.Name(), err)
+		}
+	}
+
+	return nil
+}
+
+type seekReaderAt interface {
+	io.ReaderAt
+	io.Seeker
+}
+
+// Additional compression methods not offered by archive/zip.
+// See https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT section 4.4.5.
+const (
+	ZipMethodBzip2 = 12
+	// TODO: LZMA: Disabled - because 7z isn't able to unpack ZIP+LZMA ZIP+LZMA2 archives made this way - and vice versa.
+	// ZipMethodLzma = 14
+	ZipMethodZstd = 93
+	ZipMethodXz   = 95
+)
+
+// compressedFormats is a (non-exhaustive) set of lowercased
+// file extensions for formats that are typically already
+// compressed. Compressing files that are already compressed
+// is inefficient, so use this set of extensions to avoid that.
+var compressedFormats = map[string]struct{}{
+	".7z":   {},
+	".avi":  {},
+	".br":   {},
+	".bz2":  {},
+	".cab":  {},
+	".docx": {},
+	".gif":  {},
+	".gz":   {},
+	".jar":  {},
+	".jpeg": {},
+	".jpg":  {},
+	".lz":   {},
+	".lz4":  {},
+	".lzma": {},
+	".m4v":  {},
+	".mov":  {},
+	".mp3":  {},
+	".mp4":  {},
+	".mpeg": {},
+	".mpg":  {},
+	".png":  {},
+	".pptx": {},
+	".rar":  {},
+	".sz":   {},
+	".tbz2": {},
+	".tgz":  {},
+	".tsz":  {},
+	".txz":  {},
+	".xlsx": {},
+	".xz":   {},
+	".zip":  {},
+	".zipx": {},
+}
+
+var zipHeaders = [][]byte{
+	[]byte("PK\x03\x04"), // normal
+	[]byte("PK\x05\x06"), // empty
+}
+
+// Interface guards
+var (
+	_ Archiver      = Zip{}
+	_ ArchiverAsync = Zip{}
+	_ Extractor     = Zip{}
+)
diff --git a/vendor/github.com/mholt/archives/zlib.go b/vendor/github.com/mholt/archives/zlib.go
new file mode 100644
index 0000000000..9ee64f4752
--- /dev/null
+++ b/vendor/github.com/mholt/archives/zlib.go
@@ -0,0 +1,74 @@
+package archives
+
+import (
+	"context"
+	"io"
+	"strings"
+
+	"github.com/klauspost/compress/zlib"
+)
+
+func init() {
+	RegisterFormat(Zlib{})
+}
+
+// Zlib facilitates zlib compression.
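+// A CompressionLevel of 0 selects zlib.DefaultCompression (see OpenWriter).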
+type Zlib struct { + CompressionLevel int +} + +func (Zlib) Extension() string { return ".zz" } +func (Zlib) MediaType() string { return "application/zlib" } + +func (zz Zlib) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), zz.Extension()) { + mr.ByName = true + } + + // match file header + buf, err := readAtMost(stream, 2) + // If an error occurred or buf is not 2 bytes we can't check the header + if err != nil || len(buf) < 2 { + return mr, err + } + + mr.ByStream = isValidZlibHeader(buf[0], buf[1]) + + return mr, nil +} + +func (zz Zlib) OpenWriter(w io.Writer) (io.WriteCloser, error) { + level := zz.CompressionLevel + if level == 0 { + level = zlib.DefaultCompression + } + return zlib.NewWriterLevel(w, level) +} + +func (Zlib) OpenReader(r io.Reader) (io.ReadCloser, error) { + return zlib.NewReader(r) +} + +func isValidZlibHeader(first, second byte) bool { + // Define all 32 valid zlib headers, see https://stackoverflow.com/questions/9050260/what-does-a-zlib-header-look-like/54915442#54915442 + validHeaders := map[uint16]struct{}{ + 0x081D: {}, 0x085B: {}, 0x0899: {}, 0x08D7: {}, + 0x1819: {}, 0x1857: {}, 0x1895: {}, 0x18D3: {}, + 0x2815: {}, 0x2853: {}, 0x2891: {}, 0x28CF: {}, + 0x3811: {}, 0x384F: {}, 0x388D: {}, 0x38CB: {}, + 0x480D: {}, 0x484B: {}, 0x4889: {}, 0x48C7: {}, + 0x5809: {}, 0x5847: {}, 0x5885: {}, 0x58C3: {}, + 0x6805: {}, 0x6843: {}, 0x6881: {}, 0x68DE: {}, + 0x7801: {}, 0x785E: {}, 0x789C: {}, 0x78DA: {}, + } + + // Combine the first and second bytes into a single 16-bit, big-endian value + header := uint16(first)<<8 | uint16(second) + + // Check if the header is in the map of valid headers + _, isValid := validHeaders[header] + return isValid +} diff --git a/vendor/github.com/mholt/archives/zstd.go b/vendor/github.com/mholt/archives/zstd.go new file mode 100644 index 0000000000..c36c6b913e --- /dev/null +++ b/vendor/github.com/mholt/archives/zstd.go @@ -0,0 +1,66 @@ +package archives + +import ( + "bytes" + "context" + "io" + "strings" + + "github.com/klauspost/compress/zstd" +) + +func init() { + RegisterFormat(Zstd{}) +} + +// Zstd facilitates Zstandard compression. +type Zstd struct { + EncoderOptions []zstd.EOption + DecoderOptions []zstd.DOption +} + +func (Zstd) Extension() string { return ".zst" } +func (Zstd) MediaType() string { return "application/zstd" } + +func (zs Zstd) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { + var mr MatchResult + + // match filename + if strings.Contains(strings.ToLower(filename), zs.Extension()) { + mr.ByName = true + } + + // match file header + buf, err := readAtMost(stream, len(zstdHeader)) + if err != nil { + return mr, err + } + mr.ByStream = bytes.Equal(buf, zstdHeader) + + return mr, nil +} + +func (zs Zstd) OpenWriter(w io.Writer) (io.WriteCloser, error) { + return zstd.NewWriter(w, zs.EncoderOptions...) +} + +func (zs Zstd) OpenReader(r io.Reader) (io.ReadCloser, error) { + zr, err := zstd.NewReader(r, zs.DecoderOptions...) 
+	if err != nil {
+		return nil, err
+	}
+	return errorCloser{zr}, nil
+}
+
+type errorCloser struct {
+	*zstd.Decoder
+}
+
+func (ec errorCloser) Close() error {
+	ec.Decoder.Close()
+	return nil
+}
+
+// magic number at the beginning of Zstandard files
+// https://github.com/facebook/zstd/blob/6211bfee5ec24dc825c11751c33aa31d618b5f10/doc/zstd_compression_format.md
+var zstdHeader = []byte{0x28, 0xb5, 0x2f, 0xfd}
diff --git a/vendor/github.com/xi2/xz/AUTHORS b/vendor/github.com/mikelolasagasti/xz/AUTHORS
similarity index 100%
rename from vendor/github.com/xi2/xz/AUTHORS
rename to vendor/github.com/mikelolasagasti/xz/AUTHORS
diff --git a/vendor/github.com/mikelolasagasti/xz/LICENSE b/vendor/github.com/mikelolasagasti/xz/LICENSE
new file mode 100644
index 0000000000..3e77abfbd0
--- /dev/null
+++ b/vendor/github.com/mikelolasagasti/xz/LICENSE
@@ -0,0 +1,12 @@
+Copyright (C) 2015-2017 Michael Cross
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
diff --git a/vendor/github.com/mikelolasagasti/xz/README.md b/vendor/github.com/mikelolasagasti/xz/README.md
new file mode 100644
index 0000000000..625ef07c41
--- /dev/null
+++ b/vendor/github.com/mikelolasagasti/xz/README.md
@@ -0,0 +1,23 @@
+# Xz
+
+Package xz implements XZ decompression natively in Go.
+
+Documentation at <https://pkg.go.dev/github.com/mikelolasagasti/xz>.
+
+Download and install with `go get github.com/mikelolasagasti/xz`.
+
+If you need compression as well as decompression, you might want to
+look at <https://github.com/ulikunitz/xz>.
+
+# LICENSE
+
+This was originally released into the public domain by the AUTHORS.
+Here it is licensed more explicitly as Zero-Clause BSD (0BSD) so that it can be
+detected by automated tooling, and satisfy the legal requirements for vendor
+integration for cases in which a "public domain" statement is not sufficient.
+
+This Go package is a modified version of
+
+    XZ Embedded <https://tukaani.org/xz/embedded.html>
+
+and the project adopted 0BSD for newer commits.
diff --git a/vendor/github.com/xi2/xz/dec_bcj.go b/vendor/github.com/mikelolasagasti/xz/dec_bcj.go similarity index 100% rename from vendor/github.com/xi2/xz/dec_bcj.go rename to vendor/github.com/mikelolasagasti/xz/dec_bcj.go diff --git a/vendor/github.com/xi2/xz/dec_delta.go b/vendor/github.com/mikelolasagasti/xz/dec_delta.go similarity index 100% rename from vendor/github.com/xi2/xz/dec_delta.go rename to vendor/github.com/mikelolasagasti/xz/dec_delta.go diff --git a/vendor/github.com/xi2/xz/dec_lzma2.go b/vendor/github.com/mikelolasagasti/xz/dec_lzma2.go similarity index 100% rename from vendor/github.com/xi2/xz/dec_lzma2.go rename to vendor/github.com/mikelolasagasti/xz/dec_lzma2.go diff --git a/vendor/github.com/xi2/xz/dec_stream.go b/vendor/github.com/mikelolasagasti/xz/dec_stream.go similarity index 100% rename from vendor/github.com/xi2/xz/dec_stream.go rename to vendor/github.com/mikelolasagasti/xz/dec_stream.go diff --git a/vendor/github.com/xi2/xz/dec_util.go b/vendor/github.com/mikelolasagasti/xz/dec_util.go similarity index 100% rename from vendor/github.com/xi2/xz/dec_util.go rename to vendor/github.com/mikelolasagasti/xz/dec_util.go diff --git a/vendor/github.com/xi2/xz/dec_xz.go b/vendor/github.com/mikelolasagasti/xz/dec_xz.go similarity index 100% rename from vendor/github.com/xi2/xz/dec_xz.go rename to vendor/github.com/mikelolasagasti/xz/dec_xz.go diff --git a/vendor/github.com/xi2/xz/doc.go b/vendor/github.com/mikelolasagasti/xz/doc.go similarity index 100% rename from vendor/github.com/xi2/xz/doc.go rename to vendor/github.com/mikelolasagasti/xz/doc.go diff --git a/vendor/github.com/xi2/xz/reader.go b/vendor/github.com/mikelolasagasti/xz/reader.go similarity index 100% rename from vendor/github.com/xi2/xz/reader.go rename to vendor/github.com/mikelolasagasti/xz/reader.go diff --git a/vendor/github.com/minio/minlz/.gitattributes b/vendor/github.com/minio/minlz/.gitattributes new file mode 100644 index 0000000000..402433593c --- /dev/null +++ b/vendor/github.com/minio/minlz/.gitattributes @@ -0,0 +1,2 @@ +* -text +*.bin -text -diff diff --git a/vendor/github.com/minio/minlz/.gitignore b/vendor/github.com/minio/minlz/.gitignore new file mode 100644 index 0000000000..a09c56df5c --- /dev/null +++ b/vendor/github.com/minio/minlz/.gitignore @@ -0,0 +1 @@ +/.idea diff --git a/vendor/github.com/minio/minlz/.goreleaser.yaml b/vendor/github.com/minio/minlz/.goreleaser.yaml new file mode 100644 index 0000000000..79c6de8329 --- /dev/null +++ b/vendor/github.com/minio/minlz/.goreleaser.yaml @@ -0,0 +1,68 @@ +version: 2 + +builds: + - + id: "mz" + binary: mz + main: ./cmd/mz + flags: + - -trimpath + env: + - CGO_ENABLED=0 + goos: + - aix + - linux + - freebsd + - netbsd + - windows + - openbsd + - darwin + goarch: + - 386 + - amd64 + - arm + - arm64 + - ppc64 + - ppc64le + - mips64 + - mips64le + - s390x + - riscv64 + goarm: + - 7 + +archives: + - + id: minlz-binaries + name_template: "minlz-{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}" + format_overrides: + - goos: windows + format: zip + files: + - README.md + - LICENSE +checksum: + name_template: 'checksums.txt' +snapshot: + version_template: "{{ .Tag 
}}-next" +changelog: + sort: asc + filters: + exclude: + - '^doc:' + - '^docs:' + - '^test:' + - '^tests:' + - '^Update\sREADME.md' + +nfpms: + - + file_name_template: "minlz_package__{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}" + vendor: MinIO Inc + homepage: https://github.com/minio/minlz + maintainer: MinIO Inc + description: MinLZ Compression Tool + license: Apache 2.0 + formats: + - deb + - rpm diff --git a/vendor/github.com/minio/minlz/LICENSE b/vendor/github.com/minio/minlz/LICENSE new file mode 100644 index 0000000000..a1482d99e9 --- /dev/null +++ b/vendor/github.com/minio/minlz/LICENSE @@ -0,0 +1,177 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+
+END OF TERMS AND CONDITIONS
\ No newline at end of file
diff --git a/vendor/github.com/minio/minlz/README.md b/vendor/github.com/minio/minlz/README.md
new file mode 100644
index 0000000000..0b829c441e
--- /dev/null
+++ b/vendor/github.com/minio/minlz/README.md
@@ -0,0 +1,821 @@
+# MinLZ
+
+MinLZ is an LZ77-type compressor with a fixed byte-aligned encoding, in a similar class to Snappy and LZ4.
+
+The goal of MinLZ is to provide a fast, low memory compression algorithm that can be used for fast compression of data,
+where encoding and/or decoding speed is the primary concern.
+
+MinLZ is designed to operate *faster than IO* for both compression and decompression and be a viable "always on"
+option even if some content already is compressed.
+If slow compression is acceptable, MinLZ can be configured to produce a high compression ratio,
+but retain high decompression speed.
+
+* Best in class compression
+* Block or Streaming interfaces
+* Very fast decompression, even as pure Go
+* AMD64 encoder+decoder assembly
+* Adjustable Compression (3 levels)
+* Concurrent stream Compression
+* Concurrent stream Decompression
+* Skip forward in compressed stream via independent blocks
+* Random seeking with optional indexes
+* Stream EOF validation
+* Automatic stream size padding
+* Custom encoders for small blocks
+* Skippable/Non-skippable user blocks
+* Detailed control of memory under decompression
+* Fast detection of pre-compressed data
+* Powerful commandline utility
+
+This package implements the MinLZ specification v1.0 in Go.
+
+For format specification see the included [SPEC.md](SPEC.md).
+
+# Changelog
+
+* [v1.0.0](https://github.com/minio/minlz/releases/tag/v1.0.0)
+  * [Initial Release Blog Post](https://blog.min.io/minlz-compression-algorithm/).
+
+# Usage
+
+[![Go Reference](https://pkg.go.dev/badge/minio/minlz.svg)](https://pkg.go.dev/github.com/minio/minlz?tab=subdirectories)
+[![Go](https://github.com/minio/minlz/actions/workflows/go.yml/badge.svg)](https://github.com/minio/minlz/actions/workflows/go.yml)
+
+MinLZ can operate on *blocks* up to 8 MB or *streams* with unlimited length.
+
+Blocks are the simplest, but do not provide any output validation.
+Blocks are mainly useful for small data sizes.
+
+Streams are a collection of independent blocks, which each have checksums and EOF checks,
+which guard against corruption and truncation.
+
+3 compression levels are provided:
+
+* Level 1, "Fastest": Provides the fastest compression with reasonable compression.
+* Level 2, "Balanced": Provides a good balance between compression and speed. ~50% the speed of the fastest level.
+* Level 3, "Smallest": Provides the smallest output possible. Not tuned for speed.
+
+A secondary option to control speed/compression is adjusting the block size.
+See "Writer Block Size" section below.
+
+## Blocks
+
+MinLZ provides a block encoding interface with blocks up to 8MB.
+Blocks do not perform any data integrity check of the content,
+so an additional checksum is recommended.
+
+A basic roundtrip looks like this:
+
+```Go
+	compressed, err := minlz.Encode(nil, src, minlz.LevelBalanced)
+	if err != nil {
+		// Handle error
+	}
+
+	decompressed, err := minlz.Decode(nil, compressed)
+	if err != nil {
+		// Handle error
+	}
+```
+
+In both cases, a destination buffer can be provided, which will be overwritten.
+If the destination buffer is too small, an appropriately sized buffer will be allocated and returned.
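+
+Since the destination is only reallocated when it is too small, it can be reused
+across many blocks; a minimal sketch (the `blocks` slice is hypothetical):
+
+```Go
+	var dst []byte
+	for _, block := range blocks {
+		var err error
+		// Reuse dst between iterations; Encode overwrites it and only
+		// allocates when the compressed output does not fit.
+		dst, err = minlz.Encode(dst, block, minlz.LevelFastest)
+		if err != nil {
+			// Handle error
+		}
+		// Use dst before the next iteration overwrites it.
+	}
+```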
+
+It is possible to get the decompressed buffer size by using `minlz.DecodedLen(block []byte) (int, error)`.
+
+You can use the predefined `LevelFastest`, `LevelBalanced` or `LevelSmallest` which correspond to
+levels 1, 2 and 3 respectively.
+
+MinLZ does not track the compressed size of buffers, and the decode input must match the encoded output exactly.
+Extra bytes given to decompression will return an error.
+
+It is possible to use `minlz.TryEncode`, which will only return compressed bytes if the output size
+is strictly less than the input.
+Use `minlz.AppendEncoded` and `minlz.AppendDecoded` to append to existing slices.
+
+## Streams
+
+Streams provide much more safety and allow for unlimited length encoding,
+as well as seeking and concurrent encoding/decoding.
+
+Generally, you do not need buffering on the input or output side as reads and writes
+are done in rather big blocks.
+Reading and writing stream data is buffered,
+and only non-concurrent use will block on input/output.
+
+When dealing with many streams, it is recommended to re-use the Readers and Writers.
+If you are dealing with short streams, consider limiting the concurrency, so
+`block_size * concurrency` doesn't exceed the expected stream size.
+
+### Encoding
+
+Streams are the recommended way to use MinLZ.
+They provide end-to-end validation against corruption and truncation.
+
+```Go
+	// Create a new stream encoder.
+	// The encoder will write to the provided io.Writer.
+	enc := minlz.NewWriter(output)
+
+	// We defer a call to Close.
+	// This will flush any pending data and indicate we have reached the end of the stream.
+	defer enc.Close()
+
+	// Write data to the encoder.
+	// The encoder will write the compressed data to the underlying io.Writer.
+	js := json.NewEncoder(enc)
+	err := js.Encode(data)
+```
+
+Encoders can be reused by calling `Reset` on them with another output.
+This will reset the encoder to its initial state.
+
+The encoder supports the [io.ReaderFrom](https://pkg.go.dev/io#ReaderFrom) interface,
+which can be used for encoding data from an io.Reader.
+This will typically be faster than writing data to the encoder, since it avoids a memory copy.
+
+If you have a single big buffer to encode, you can use the `EncodeBuffer([]byte) error` method
+to encode it. This will encode the buffer with minimal overhead.
+If you plan to do multiple writes, use the regular `Write` function.
+
+### Options
+
+There are various options that can be set on the stream encoder.
+This can be used to control resource usage on compression and some aspects of decompression.
+If invalid options are set, the encoder will return an error when used.
+
+We will cover the most common options here. Refer to the godoc for a complete list.
+
+#### Writer Compression Level
+
+The `WriterLevel` option controls the compression level of the stream encoder.
+
+You can use the predefined `LevelFastest`, `LevelBalanced` or `LevelSmallest` which correspond to
+levels 1, 2 and 3 respectively.
+
+Setting level 0 will disable compression and write the data as an uncompressed stream.
+
+The default level is `LevelBalanced`.
+
+#### Writer Block Size
+
+The `WriterBlockSize` option sets the maximum size of each block on the stream encoder.
+The block size - rounded up to a power of 2 - is communicated in the stream, and
+the decoder will use this to allocate memory during decompression.
+
+Smaller blocks will take up less memory on both compression and decompression,
+but will result in a larger output.
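+
+As a sketch of combining the options above (assuming `NewWriter` accepts them
+variadically; see the godoc for the exact signatures), a writer tuned for low
+memory use and small output could look like:
+
+```Go
+	enc := minlz.NewWriter(output,
+		minlz.WriterBlockSize(512<<10), // 512 KB blocks
+		minlz.WriterLevel(minlz.LevelSmallest),
+	)
+	defer enc.Close()
+```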
+
+Block size further allows trading off speed vs. size; here is a sample chart of how
+speed and block size can correlate, using the fastest encoder setting:
+
+| Block Size | Output Size   | E MB/s | Size | E Speed | D Speed |
+|------------|---------------|--------|------|---------|---------|
+| 8MB        | 840,198,535   | 6419   | 100% | 100%    | 100%    |
+| 4MB        | 862,923,396   | 8470   | 103% | 132%    | 124%    |
+| 2MB        | 921,750,327   | 9660   | 110% | 150%    | 131%    |
+| 1MB        | 950,153,883   | 10407  | 113% | 162%    | 125%    |
+| 512KB      | 1,046,061,990 | 11459  | 125% | 179%    | 113%    |
+
+Input is a `3,325,605,752` byte [CSV file](https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst)
+compressed on a 16 core CPU.
+
+The actual scaling mostly depends on the amount of CPU L2 cache (speed)
+and the nature of the compressed data (size).
+
+Decompression speed is affected similarly, but less predictably,
+since it is more likely to be limited by memory throughput,
+and larger output also tends to affect it more negatively.
+
+If your software is very sensitive to GC stoppages, also note that with assembly
+single block de/compression cannot be pre-empted, so stop-the-world events may take
+longer on bigger blocks.
+
+The default block size is 2 MB.
+
+#### Writer Concurrency
+
+The `WriterConcurrency` option allows setting the number of concurrent blocks that can be compressed.
+Higher concurrency will increase the throughput of the encoder, but will also increase memory usage.
+
+If `WriterConcurrency(1)` is used, no async goroutines will be used and the encoder will run in the calling goroutine.
+
+The default concurrency is `GOMAXPROCS`.
+
+### Decoding
+
+Decoding streams mostly just involves sending the compressed stream to a Reader.
+
+Anything accepting an `io.Reader` as input will then be able to read the decompressed data.
+
+```Go
+	// Create a new stream decoder.
+	// The decoder will read from the provided io.Reader.
+	dec := minlz.NewReader(input)
+
+	// Read decompressed input.
+	js := json.NewDecoder(dec)
+	err := js.Decode(&data)
+```
+
+If you would like the output to be written to an `io.Writer`, the easiest is to use
+the `WriteTo` functionality.
+
+```Go
+	// Our input and output
+	in, _ := os.Open("input.mz")
+	out, _ := os.Create("output.txt")
+
+	// Create a new stream decoder
+	dec := minlz.NewReader(in)
+
+	// Write all decompressed data to output
+	n, err := dec.WriteTo(out)
+	fmt.Println("Wrote", n, "bytes. Error:", err)
+```
+
+The `DecompressConcurrent` method has similar functionality to `WriteTo`, but allows specifying the concurrency.
+By default `WriteTo` uses `runtime.NumCPU()` or at most 8 concurrent decompressors.
+Besides offering higher throughput, using `DecompressConcurrent` will also make input reads async.
+
+For memory-sensitive systems, the maximum block size can be set below 8MB. For this use the `ReaderMaxBlockSize(int)`
+option.
+
+#### Skipping and Seeking
+
+Streams can be skipped forward by calling `(*Reader).Skip(n int64) error`.
+This will skip forward in the stream by `n` bytes.
+Intermediate blocks will be read, but will not be decompressed unless the skip ends inside the block.
+
+Full random seeking is supported by using an *index*. An index can be created when the stream is encoded.
+The index can either be added to the stream or stored separately.
+For existing streams the `IndexStream(r io.Reader) ([]byte, error)` function can be used to create an index.
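+
+For example (a sketch; the file name is hypothetical):
+
+```Go
+	f, err := os.Open("existing.mz")
+	if err != nil {
+		// Handle error
+	}
+	defer f.Close()
+
+	// Scan the compressed stream and build an index usable for seeking.
+	idx, err := minlz.IndexStream(f)
+	if err != nil {
+		// Handle error
+	}
+```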
+
+To add an index at the end of streams, use the `WriterAddIndex()` option when creating the writer,
+then the index will be added to the stream when it is closed.
+To keep the index separate, use the `(*Writer).CloseIndex() ([]byte, error)` method to retrieve
+the index when finishing a stream.
+
+To get a fully seekable reader use `(*Reader).ReadSeeker(index []byte) (*ReadSeeker, error)`.
+The returned reader will implement `io.Seeker` and `io.ReaderAt` in addition to the existing `Reader` methods
+and can be used to seek to any position in the stream.
+
+If an index is not provided in the call, the reader will attempt to read the index from the end of the stream.
+If the input stream does not support `io.Seeker` an error will be returned.
+
+## Custom User Data
+
+Streams can contain user-defined data that isn't part of the stream itself.
+Each "chunk" has an ID, which allows for processing of different types.
+
+This data can be "skippable" - meaning it is ignored if the user hasn't provided a handler for it - or non-skippable.
+If the chunk is non-skippable, the decoder will error out if the chunk isn't handled by the user.
+
+`MinUserSkippableChunk` is the minimum chunk ID with user data and `MaxUserSkippableChunk` is the maximum.
+
+`MinUserNonSkippableChunk` is the minimum ID that will not automatically be skipped if unhandled by the user.
+Finally `MaxUserNonSkippableChunk` is the final ID that can be used for this.
+
+The custom data will not be compressed or modified in any way.
+
+```go
+func ExampleWriterAddUserChunk() {
+	var buf bytes.Buffer
+	w := minlz.NewWriter(&buf)
+	// Add a skippable chunk
+	w.AddUserChunk(minlz.MinUserSkippableChunk, []byte("Chunk Custom Data"))
+	// Write content to stream.
+	w.Write([]byte("some data"))
+	w.Close()
+
+	// Read back what we wrote.
+	r := minlz.NewReader(&buf)
+	r.SkippableCB(minlz.MinUserSkippableChunk, func(sr io.Reader) error {
+		b, err := io.ReadAll(sr)
+		fmt.Println("Callback:", string(b))
+		return err
+	})
+
+	// Read stream data
+	b, _ := io.ReadAll(r)
+	fmt.Println("Stream data:", string(b))
+
+	//OUTPUT:
+	//Callback: Chunk Custom Data
+	//Stream data: some data
+}
+```
+
+The maximum single chunk size is 16MB, but as many chunks as needed can be added.
+
+## Build Tags
+
+The following build tags can be used to control which speed improvements are used:
+
+* `noasm` disables all assembly.
+* `nounsafe` disables all use of the unsafe package.
+* `purego` disables assembly and unsafe usage.
+
+Using assembly/non-assembly versions will often produce slightly different output.
+
+We will support the 2 releases prior to the current Go release version.
+
+This package has been extensively fuzz tested to ensure that no data input can cause
+crashes or excessive memory usage.
+
+When doing fuzz testing, use `-tags=nounsafe`. Non-assembly functions will also be tested,
+but for completeness also test with `-tags=purego`.
+
+# Performance
+
+## BLOCKS
+
+Individual block benchmarks should be considered carefully - and can be hard to generalize,
+since they tend to over-emphasize specific characteristics of the content.
+
+Therefore, it will be easy to find counter-examples to the benchmarks, where specific patterns suit a
+specific compressor better than others.
+We present a few examples from the [Snappy benchmark set](https://github.com/google/snappy/tree/main/testdata).
+As a benchmark this set has an over-emphasis on text files.
+
+Blocks are compressed/decompressed using 16 concurrent threads on an AMD Ryzen 9 3950X 16-Core Processor.
+Click below to see some sample benchmarks compared to Snappy and LZ4:
+
+### Protobuf Sample
+
+
+| Compressor   | Size   | Comp MB/s | Decomp MB/s | Reduction % |
+|--------------|--------|----------:|-------------|-------------|
+| MinLZ 1      | 17,613 | 27,837    | 116,762     | 85.15%      |
+| MinLZ 1 (Go) | 17,479 | 22,036    | 61,652      | 85.26%      |
+| MinLZ 2      | 16,345 | 12,797    | 103,100     | 86.22%      |
+| MinLZ 2 (Go) | 16,345 | 9,732     | 52,964      | 86.22%      |
+| MinLZ 3      | 14,766 | 210       | 126,385     | 87.55%      |
+| MinLZ 3 (Go) | 14,766 |           | 68,411      | 87.55%      |
+| Snappy       | 23,335 | 24,052    | 61,002      | 80.32%      |
+| Snappy (Go)  | 23,335 | 10,055    | 35,699      | 80.32%      |
+| LZ4 0        | 18,766 | 12,649    | 137,553     | 84.18%      |
+| LZ4 0 (Go)   | 18,766 |           | 64,092      | 84.18%      |
+| LZ4 9        | 15,844 | 12,649    | 139,801     | 86.64%      |
+| LZ4 9 (Go)   | 15,844 |           | 66,904      | 86.64%      |
+
+![Compression vs Size](img/pb-block.png)
+
+Source file: https://github.com/google/snappy/blob/main/testdata/geo.protodata
+
+
+### HTML Sample
+
+<details>
+  <summary>Click To See Data + Charts</summary>
+
+(102,400 bytes input)
+
+| Compressor   | Size   | Comp MB/s | Decomp MB/s | Reduction % |
+|--------------|--------|----------:|-------------|-------------|
+| MinLZ 1      | 20,184 | 17,558    | 82,292      | 80.29%      |
+| MinLZ 1 (Go) | 19,849 | 15,035    | 32,327      | 80.62%      |
+| MinLZ 2      | 17,831 | 9,260     | 58,432      | 82.59%      |
+| MinLZ 2 (Go) | 17,831 | 7,524     | 25,728      | 82.59%      |
+| MinLZ 3      | 16,025 | 180       | 80,445      | 84.35%      |
+| MinLZ 3 (Go) | 16,025 |           | 33,382      | 84.35%      |
+| Snappy       | 22,843 | 17,469    | 44,765      | 77.69%      |
+| Snappy (Go)  | 22,843 | 8,161     | 21,082      | 77.69%      |
+| LZ4 0        | 21,216 | 9,452     | 101,490     | 79.28%      |
+| LZ4 0 (Go)   | 21,216 |           | 40,674      | 79.28%      |
+| LZ4 9        | 17,139 | 1,407     | 95,706      | 83.26%      |
+| LZ4 9 (Go)   | 17,139 |           | 39,709      | 83.26%      |
+
+![Compression vs Size](img/html-block.png)
+
+Source file: https://github.com/google/snappy/blob/main/testdata/html
+
+</details>
+
+### URL List Sample
+
+<details>
+  <summary>Click To See Data + Charts</summary>
+
+(702,087 bytes input)
+
+| Compressor   | Size    | Comp MB/s | Decomp MB/s | Reduction % |
+|--------------|---------|----------:|-------------|-------------|
+| MinLZ 1      | 268,803 | 9,774     | 30,961      | 61.71%      |
+| MinLZ 1 (Go) | 260,937 | 7,935     | 17,362      | 62.83%      |
+| MinLZ 2      | 230,280 | 5,197     | 26,871      | 67.20%      |
+| MinLZ 2 (Go) | 230,280 | 4,280     | 13,926      | 67.20%      |
+| MinLZ 3      | 207,303 | 226       | 28,716      | 70.47%      |
+| MinLZ 3 (Go) | 207,303 |           | 15,256      | 70.47%      |
+| Snappy       | 335,492 | 9,398     | 24,207      | 52.22%      |
+| Snappy (Go)  | 335,492 | 4,683     | 12,359      | 52.22%      |
+| LZ4 0        | 299,342 | 4,462     | 51,220      | 57.36%      |
+| LZ4 0 (Go)   | 299,342 |           | 23,242      | 57.36%      |
+| LZ4 9        | 252,182 | 638       | 45,295      | 64.08%      |
+| LZ4 9 (Go)   | 252,182 |           | 16,240      | 64.08%      |
+
+![Compression vs Size](img/urls-block.png)
+
+Source file: https://github.com/google/snappy/blob/main/testdata/urls.10K
+
+</details>
+
+### Serialized GEO Data Sample
+
+<details>
+  <summary>Click To See Data + Charts</summary>
+
+(184,320 bytes input)
+
+| Compressor   | Size   | Comp MB/s | Decomp MB/s | Reduction % |
+|--------------|--------|----------:|-------------|-------------|
+| MinLZ 1      | 63,595 | 8,319     | 26,170      | 65.50%      |
+| MinLZ 1 (Go) | 62,087 | 7,601     | 12,118      | 66.32%      |
+| MinLZ 2      | 54,688 | 5,932     | 24,688      | 70.33%      |
+| MinLZ 2 (Go) | 52,752 | 4,690     | 10,566      | 71.38%      |
+| MinLZ 3      | 46,002 | 230       | 28,083      | 75.04%      |
+| MinLZ 3 (Go) | 46,002 |           | 12,877      | 75.04%      |
+| Snappy       | 69,526 | 10,198    | 19,754      | 62.28%      |
+| Snappy (Go)  | 69,526 | 5,031     | 8,712       | 62.28%      |
+| LZ4 0        | 66,506 | 5,355     | 45,305      | 63.92%      |
+| LZ4 0 (Go)   | 66,506 |           | 15,757      | 63.92%      |
+| LZ4 9        | 50,439 | 88        | 52,877      | 72.64%      |
+| LZ4 9 (Go)   | 50,439 |           | 18,171      | 72.64%      |
+
+![Compression vs Size](img/geo-block.png)
+
+Source file: https://github.com/google/snappy/blob/main/testdata/kppkn.gtb
+
+</details>
+
+In overall terms, we typically observe that:
+
+* The fastest mode typically beats LZ4 both in speed and output size.
+* The fastest mode is typically equal to Snappy in speed, but significantly smaller.
+* The "balanced" mode typically beats the best possible LZ4 compression, but much faster.
+* Without assembly, MinLZ is mostly the fastest option for compression.
+* LZ4 is the decompression speed king.
+* Snappy decompression is usually slowest — especially without assembly.
+
+We encourage you to do your own testing with realistic blocks.
+
+You can use `λ mz c -block -bench=10 -verify -cpu=16 -1 file.ext` with our commandline tool to test the speed of block encoding/decoding.
+
+## STREAMS
+
+For fair stream comparisons, we run each encoder at its maximum block size,
+capped at 4MB, while maintaining independent blocks where that is an option.
+We use the concurrency offered by each package.
+
+This means there may be further speed/size tradeoffs possible for each,
+so experiment with fine-tuning for your needs.
+
+Streams are compressed/decompressed using a 16-core AMD Ryzen 9 3950X processor.
+
+### JSON Stream
+
+Input Size: 6,273,951,764 bytes
+
+| Compressor  | Speed MiB/s | Size          | Reduction | Dec MiB/s |
+|-------------|------------:|--------------:|----------:|----------:|
+| MinLZ 1     | 14,921      | 974,656,419   | 84.47%    | 3,204     |
+| MinLZ 2     | 8,877       | 901,171,279   | 85.64%    | 3,028     |
+| MinLZ 3     | 576         | 742,067,802   | 88.17%    | 3,835     |
+| S2 Default  | 15,501      | 1,041,700,255 | 83.40%    | 2,378     |
+| S2 Better   | 9,334       | 944,872,699   | 84.94%    | 2,300     |
+| S2 Best     | 732         | 826,384,742   | 86.83%    | 2,572     |
+| LZ4 Fastest | 5,860       | 1,274,297,625 | 79.69%    | 2,680     |
+| LZ4 Best    | 1,772       | 1,091,826,460 | 82.60%    | 2,694     |
+| Snappy      | 951         | 1,525,176,492 | 75.69%    | 1,828     |
+| Gzip L5     | 236         | 938,015,731   | 85.05%    | 557       |
+
+![Compression vs Size](img/json-v1-comp.png)
+![Decompression Speed](img/json-v1-decomp.png)
+
+Source file: https://files.klauspost.com/compress/github-june-2days-2019.json.zst
+
+
+### CSV Stream
+
+<details>
+  <summary>Click To See Data + Charts</summary>
+
+Input Size: 3,325,605,752 bytes
+
+| Compressor | Speed MiB/s | Size          | Reduction |
+|------------|-------------|---------------|-----------|
+| MinLZ 1    | 9,193       | 937,136,278   | 72.07%    |
+| MinLZ 2    | 6,158       | 775,823,904   | 77.13%    |
+| MinLZ 3    | 338         | 657,162,410   | 80.66%    |
+| S2 Default | 10,679      | 1,093,516,949 | 67.12%    |
+| S2 Better  | 6,394       | 884,711,436   | 73.40%    |
+| S2 Best    | 400         | 773,678,211   | 76.74%    |
+| LZ4 Fast   | 4,835       | 1,066,961,737 | 67.92%    |
+| LZ4 Best   | 732         | 903,598,068   | 72.83%    |
+| Snappy     | 553         | 1,316,042,016 | 60.43%    |
+| Gzip L5    | 128         | 767,340,514   | 76.93%    |
+
+![Compression vs Size](img/csv-v1-comp.png)
+
+Source file: https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
+
+</details>
+
+### Log Data
+
+<details>
+  <summary>Click To See Data + Charts</summary>
+
+Input Size: 2,622,574,440 bytes
+
+| Compressor | Speed MiB/s | Size        | Reduction |
+|------------|-------------|-------------|-----------|
+| MinLZ 1    | 17,014      | 194,361,157 | 92.59%    |
+| MinLZ 2    | 12,696      | 174,819,425 | 93.33%    |
+| MinLZ 3    | 1,351       | 139,449,942 | 94.68%    |
+| S2 Default | 17,131      | 230,521,260 | 91.21%    |
+| S2 Better  | 12,632      | 217,884,566 | 91.69%    |
+| S2 Best    | 1,687       | 185,357,903 | 92.93%    |
+| LZ4 Fast   | 6,115       | 216,323,995 | 91.75%    |
+| LZ4 Best   | 2,704       | 169,447,971 | 93.54%    |
+| Snappy     | 1,987       | 290,116,961 | 88.94%    |
+| Gzip L5    | 498         | 142,119,985 | 94.58%    |
+
+![Compression vs Size](img/logs-v1-comp.png)
+
+Source file: https://files.klauspost.com/compress/apache.log.zst
+
+</details>
+
+### Serialized Data
+
+<details>
+  <summary>Click To See Data + Charts</summary>
+
+Input Size: 1,862,623,243 bytes
+
+| Compressor | Speed MiB/s | Size        | Reduction |
+|------------|-------------|-------------|-----------|
+| MinLZ 1    | 10,701      | 604,315,773 | 67.56%    |
+| MinLZ 2    | 5,712       | 517,472,464 | 72.22%    |
+| MinLZ 3    | 250         | 480,707,192 | 74.19%    |
+| S2 Default | 12,167      | 623,832,101 | 66.51%    |
+| S2 Better  | 5,712       | 568,441,654 | 69.48%    |
+| S2 Best    | 324         | 553,965,705 | 70.26%    |
+| LZ4 Fast   | 5,090       | 618,174,538 | 66.81%    |
+| LZ4 Best   | 617         | 552,015,243 | 70.36%    |
+| Snappy     | 929         | 589,837,541 | 68.33%    |
+| Gzip L5    | 166         | 434,950,800 | 76.65%    |
+
+![Compression vs Size](img/msgp-v1-comp.png)
+
+Source file: https://files.klauspost.com/compress/github-ranks-backup.bin.zst
+
+</details>
+
+### Backup (Mixed) Data
+
+<details>
+  <summary>Click To See Data + Charts</summary>
+
+Input Size: 10,065,157,632 bytes
+
+| Compressor  | Speed MiB/s | Size          | Reduction |
+|-------------|-------------|---------------|-----------|
+| MinLZ 1     | 9,356       | 5,859,748,636 | 41.78%    |
+| MinLZ 2     | 5,321       | 5,256,474,340 | 47.78%    |
+| MinLZ 3     | 259         | 4,855,930,368 | 51.76%    |
+| S2 Default  | 10,083      | 5,915,541,066 | 41.23%    |
+| S2 Better   | 5,731       | 5,455,008,813 | 45.80%    |
+| S2 Best     | 319         | 5,192,490,222 | 48.41%    |
+| LZ4 Fastest | 5,065       | 5,850,848,099 | 41.87%    |
+| LZ4 Best    | 287         | 5,348,127,708 | 46.86%    |
+| Snappy      | 732         | 6,056,946,612 | 39.82%    |
+| Gzip L5     | 171         | 4,916,436,115 | 51.15%    |
+
+![Compression vs Size](img/10gb-v1-comp.png)
+
+Source file: https://mattmahoney.net/dc/10gb.html
+
+</details>
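+
+For reference, a stream encoder configured roughly as in these benchmarks could look
+like the following sketch. `WriterConcurrency` is documented above; `WriterBlockSize`
+is assumed to be this package's block size option, and the values are illustrative:
+
+```Go
+	// Encode a stream with 4MB blocks and full concurrency,
+	// mirroring the benchmark setup above.
+	enc := minlz.NewWriter(out,
+		minlz.WriterBlockSize(4<<20),                   // assumed option name
+		minlz.WriterConcurrency(runtime.GOMAXPROCS(0)), // the default
+	)
+	if _, err := io.Copy(enc, in); err != nil {
+		log.Fatalln(err)
+	}
+	// Close flushes any pending blocks and finishes the stream.
+	if err := enc.Close(); err != nil {
+		log.Fatalln(err)
+	}
+```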
+
+Our conclusion is that the new compression algorithm provides a good compression increase,
+while retaining the ability to saturate pretty much any IO, either with compression or
+decompression, given a moderate number of CPU cores.
+
+
+## Why is concurrent block and stream speed so different?
+
+In most cases, MinLZ will be limited by memory bandwidth.
+
+Since streams consist of mostly "unseen" data, memory reads will often be
+outside any CPU cache.
+
+Contrast that with blocks, where data has often just been read/produced and is therefore
+already in one of the CPU caches.
+Block (de)compression will thus more often take place with data read from cache,
+whereas stream data may be coming from main memory.
+
+Even if data is streamed into cache, the "penalty" will still have to be paid at some
+place in the chain. So streams will mostly appear slower in benchmarks.
+
+
+# Commandline utility
+
+Official releases can be downloaded from the [releases](https://github.com/minio/minlz/releases) section
+with binaries for most platforms.
+
+To install from source, execute `go install github.com/minio/minlz/cmd/mz@latest`.
+
+## Usage
+
+```
+λ mz
+MinLZ compression tool vx.x built at home, (c) 2025 MinIO Inc.
+Homepage: https://github.com/minio/minlz
+
+Usage:
+Compress:   mz c [options]
+Decompress: mz d [options]
+  (cat)   : mz cat [options]
+  (tail)  : mz tail [options]
+
+Without options 'c' and 'd' can be omitted. Extension decides if decompressing.
+
+Compress file:    mz file.txt
+Compress stdin:   mz -
+Decompress file:  mz file.txt.mz
+Decompress stdin: mz d -
+```
+
+Note that all option sizes KB, MB, etc. are base 1024 in the commandline tool.
+
+Speed indications are base 10.
+
+### Compressing
+
+<details>
+  <summary>Click To Compression Help</summary>
+
+```
+Usage: mz c [options]
+
+Compresses all files supplied as input separately.
+Output files are written as 'filename.ext.mz'.
+By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and compressed.
+Only http response code 200 is accepted.
+
+Options:
+  -1    Compress faster, but with a minor compression loss
+  -2    Default compression speed (default true)
+  -3    Compress more, but a lot slower
+  -bench int
+        Run benchmark n times. No output will be written
+  -block
+        Use as a single block. Will load content into memory. Max 8MB.
+  -bs string
+        Max block size. Examples: 64K, 256K, 1M, 8M. Must be power of two and <= 8MB (default "8M")
+  -c    Write all output to stdout. Multiple input files will be concatenated
+  -cpu int
+        Maximum number of threads to use (default 32)
+  -help
+        Display help
+  -index
+        Add seek index (default true)
+  -o string
+        Write output to another file. Single input file only
+  -pad string
+        Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1")
+  -q    Don't write any output to terminal, except errors
+  -recomp
+        Recompress MinLZ, Snappy or S2 input
+  -rm
+        Delete source file(s) after success
+  -safe
+        Do not overwrite output files
+  -verify
+        Verify files, but do not write output
+
+Example:
+
+λ mz c apache.log
+Compressing apache.log -> apache.log.mz 2622574440 -> 170960982 [6.52%]; 4155.2MB/s
+```
+</details>
+
+### Decompressing
+
+<details>
+  <summary>Click To Decompression Help</summary>
+
+```
+Usage: mz d [options]
+
+Decompresses all files supplied as input. Input files must end with '.mz', '.s2' or '.sz'.
+Output file names have the extension removed. By default output files will be overwritten.
+Use - as the only file name to read from stdin and write to stdout.
+
+Wildcards are accepted: testdir/*.txt will decompress all files in testdir ending with .txt
+Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
+
+File names beginning with 'http://' and 'https://' will be downloaded and decompressed.
+Extensions on downloaded files are ignored. Only http response code 200 is accepted.
+
+Options:
+  -bench int
+        Run benchmark n times. No output will be written
+  -block
+        Decompress single block. Will load content into memory. Max 8MB.
+  -block-debug
+        Print block encoding
+  -c    Write all output to stdout. Multiple input files will be concatenated
+  -cpu int
+        Maximum number of threads to use (default 32)
+  -help
+        Display help
+  -limit string
+        Return at most this much data. Examples: 92, 64K, 256K, 1M, 4M
+  -o string
+        Write output to another file. Single input file only
+  -offset string
+        Start at offset. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
+  -q    Don't write any output to terminal, except errors
+  -rm
+        Delete source file(s) after success
+  -safe
+        Do not overwrite output files
+  -tail string
+        Return last of compressed file. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
+  -verify
+        Verify files, but do not write output
+
+Example:
+
+λ mz d apache.log.mz
+Decompressing apache.log.mz -> apache.log 170960982 -> 2622574440 [1534.02%]; 2660.2MB/s
+```
+</details>
+
+Tail, Offset and Limit can be made to forward to the next newline by adding `+nl`.
+
+For example, `mz d -c -offset=50MB+nl -limit=1KB+nl enwik9.mz` will skip 50MB,
+search for the next newline, and start outputting data.
+After 1KB, it will stop at the next newline.
+
+Partial files (decoded with tail, offset or limit) will have a `.part` extension.
+
+# Snappy/S2 Compatibility
+
+MinLZ is designed to be easily upgradable from [Snappy](https://github.com/google/snappy)
+and [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression).
+
+Both the streaming and block interfaces in the Go port provide seamless
+compatibility with existing Snappy and S2 content.
+This means that any content encoded with either will be decoded correctly by MinLZ.
+
+Content encoded with MinLZ cannot be decoded by Snappy or S2.
+
+| Version        | Snappy Decoder | S2 Decoder | MinLZ Decoder |
+|----------------|----------------|------------|---------------|
+| Snappy Encoder | ✔              | ✔          | ✔ (*)         |
+| S2 Encoder     | x              | ✔          | ✔ (*)         |
+| MinLZ Encoder  | x              | x          | ✔             |
+
+(*) MinLZ decoders *may* implement fallback to S2/Snappy.
+This is however not required, and ports may not support this.
+
+# License
+
+MinLZ is Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Based on code from the [snappy-go](https://github.com/golang/snappy) project.
+
+# Ports
+
+Reference code is provided in the `internal/reference` folder.
+This provides simplified, but explicit, versions of the block encoder/decoder, plus
+stream and index decoders, with minimal dependencies.
+
+Currently, there are no ports of MinLZ to other languages.
+If you are interested in porting MinLZ to another language, open a discussion topic.
+
+If you do a port, feel free to send in a PR for this table:
+
+| Language | Repository Link                                                                           | License    | Block Read | Block Write | Stream Read | Stream Write | Index | Snappy Fallback |
+|----------|-------------------------------------------------------------------------------------------|------------|------------|-------------|-------------|--------------|-------|-----------------|
+| Go       | [github.com/minio/minlz](https://github.com/minio/minlz)                                   | Apache 2.0 | ✅         | ✅          | ✅          | ✅           | ✅    | ✅              |
+| C        | [Experimental GIST](https://gist.github.com/klauspost/5796a5aa116a15eb7341ffa8427bbe7a)    | CC0        | ✅         | ✅          |             |              |       |                 |
+
+Indicated features must support all parts of each feature as described in the specification.
+However, it is up to the implementation to decide the encoding implementation(s).
diff --git a/vendor/github.com/minio/minlz/SPEC.md b/vendor/github.com/minio/minlz/SPEC.md
new file mode 100644
index 0000000000..76c3335360
--- /dev/null
+++ b/vendor/github.com/minio/minlz/SPEC.md
@@ -0,0 +1,641 @@
+# MINLZ FORMAT SPECIFICATION V1.0.0
+
+All implementations are requested to state: "This implements the MinLZ specification v1.0"
+
+Furthermore, if a subset of features is supported, it should state this clearly.
+
+A reference decoder is provided.
+If there is any ambiguity in the specification,
+the behavior of the reference decoder should be followed.
+
+The spec versioning follows [semantic versioning](https://semver.org/).
+
+* Major version numbers indicate breaking changes.
+* Minor version numbers indicate added functionality that will not be readable by previous versions.
+* Patch version numbers indicate non-breaking additions to the spec.
+
+# BLOCK FORMAT
+
+MinLZ is an LZ77-style compressor with a fixed-size, byte-oriented encoding.
+
+All values are encoded with full-byte offsets as an interleaved stream,
+where operations and literals are intermixed, similar to LZ4 and Snappy.
+
+This specification defines the decoding format.
+The encoding of specific content is not defined and may change for any implementation.
+
+The basic structure is similar to Snappy, but most encodings have been adjusted.
+
+## 1.0 MinLZ Indicator
+
+An initial byte of value 0 indicates that this is a MinLZ encoded block.
+
+A 0-byte block is allowed, and is encoded as a single byte with a 0 value.
+
+If the first byte is not 0, this can be used to handle seamless
+fallback to Snappy.
+Decoders *may* implement fallback to Snappy and provide seamless decoding
+of the block if this value is non-zero.
+
+## 1.1 Block Size
+
+A block starts with the uncompressed length up to a maximum of 2^24,
+stored as an unsigned varint.
+
+Maximum uncompressed block size is 8 MiB = 8,388,608 bytes.
+
+If this value is 0, the rest of the block should be emitted as literals.
+
+A compressed block may not be bigger than the decompressed block after reading the header.
+
+## 2 MinLZ Encoding
+
+Each block is encoded as a sequence of literals and copy/repeat commands.
+
+Each element starts with a tag byte, and the lower two bits of this tag
+byte signal what type of element will follow:
+
+| Tag | Meaning                              |
+|-----|--------------------------------------|
+| 0   | Literal(s) or Repeat previous offset |
+| 1   | Copy with 10-bit offset              |
+| 2   | Copy with 16-bit offset              |
+| 3   | Copy with up to 21-bit offset        |
+
+The interpretation of the upper six bits is tag-dependent.
+
+Values spanning multiple bytes are stored in little-endian order.
+
+### 2.1 Literals/Repeat (tag 00)
+
+The literal length is stored differently depending on the length
+of the literal block.
+
+Optionally, instead of literals, a repeat copy can be specified,
+copying from the previous offset, or 1 if at the beginning of the block.
+
+The value of the tag byte should be interpreted as follows:
+
+| Bits | Meaning | Description                        |
+|------|---------|------------------------------------|
+| 0-1  | Tag     | Always 0                           |
+| 2    | Repeat  | When set do repeat copy            |
+| 3-7  | Length  | Length of literal or repeat block. |
+
+Length follows the tag, and is encoded like this:
+
+| Value | Literals          |
+|-------|-------------------|
+| 0-28  | 1 + Value         |
+| 29    | 30 + Read 1 byte  |
+| 30    | 30 + Read 2 bytes |
+| 31    | 30 + Read 3 bytes |
+
+Repeats are handled as copies, but with no offset specified.
+See below for how offsets are handled.
+
+Literals follow the tag or extended length field.
+
+### 2.2 Copies
+
+Copies are references back into previous decompressed data, telling
+the decompressor to reuse data it has previously decoded.
+They encode two values: The _offset_, saying how many bytes back
+from the current position to read, and the _length_, how many bytes
+to copy.
+
+Backreferences that reach back past the start of the block
+(offset > current decompressed position) are not allowed.
+
+As in most LZ77-based compressors, the length can be larger than the offset,
+yielding a form of run-length encoding (RLE). For instance,
+"xababab" could be encoded as
+
+`<literal: "xab"> <copy: offset=2, length=4>`
+
+There are several different kinds of copy elements, depending on
+the number of bytes to be copied (length), and how far back the
+data to be copied is (offset).
+
+Furthermore, a "repeat" can be emitted, which will use the last offset for a copy,
+but with a new length. The initial repeat offset is 1, so an initial RLE can be encoded as
+
+`<literal: "x"> <repeat: length=4>`
+
+This will emit "xxxxx".
+
+Matches shorter than 4 bytes cannot be represented, except for repeats.
+Longer offset representations have minimum offsets, mainly to help decompression speed.
+
+#### 2.2.1 Fused Literals+Copy
+
+Copies with 2 or 3 byte offsets can contain up to 4 literals in their encoding.
+The literals must be emitted before the copy.
+
+The offset is the offset of the destination *after* the copy,
+so the copy offset is the same as if the literals were emitted separately.
+
+See sections 2.5.1 and 2.5.2 for details on fused copy operations.
+
+Encoders should prefer fused literal+copy when tied for size, since the fused operation
+will typically decode faster than separate operations.
+
+### 2.3 Copy1 with 1-byte offset (tag 01)
+
+These elements can encode lengths between [4..273] bytes and offsets
+between [1..1024] bytes.
+
+| Bits | Meaning   | Description                                               |
+|------|-----------|-----------------------------------------------------------|
+| 0-1  | Tag       | Always 1                                                  |
+| 2-5  | Length    | Length of copy. Values are 0-15. See decoding table below |
+| 6-7  | Offset LB | Lower 2 bits of offset                                    |
+
+| Bits | Meaning   | Description       |
+|------|-----------|-------------------|
+| 0-7  | Offset UB | Offset Upper bits |
+
+Offset is 1 more than the stored value. The minimum offset is 1, and the maximum is 1024.
+
+| Value | Output Length    |
+|-------|------------------|
+| 0-14  | 4 + Value        |
+| 15    | 18 + Read 1 byte |
+
+Maximum length is therefore 18 + 255 = 273.
+
+The extra length byte is stored *after* the offset byte if present.
+
+Minimum encoded length is 2 bytes; maximum is 3 bytes for lengths 19 -> 273.
+Longer matches should emit a repeat with the extra bytes, or use Copy2.
+
+### 2.4 Copy2 with 2-byte offset (tag 10)
+
+These elements can encode lengths between [4...] bytes and offsets
+between [64...65599] bytes.
+
+| Bits | Meaning | Description                                               |
+|------|---------|-----------------------------------------------------------|
+| 0-1  | Tag     | Always 2                                                  |
+| 2-7  | Length  | Length of copy. Values are 0-63. See decoding table below |
+
+Offsets are encoded as 2 little-endian bytes following the tag.
+The minimum offset is 64, which should be added to the stored value.
+The maximum backreference offset is therefore 65,599.
+
+| Bits | Meaning | Description               |
+|------|---------|---------------------------|
+| 0-15 | Offset  | Offset + 64 `[64->65599]` |
+
+Lengths are encoded as follows:
+
+| Value | Output            |
+|-------|-------------------|
+| 0-60  | 4 + Value         |
+| 61    | 64 + Read 1 byte  |
+| 62    | 64 + Read 2 bytes |
+| 63    | 64 + Read 3 bytes |
+
+Minimum encoded length is 3 bytes, max 6 bytes.
+
+When both Copy1 and Copy2 have similar encoded length (mainly lengths 19->64),
+prefer Copy2, as decoding will be faster.
+
+### 2.5 Fused Copy2/3 (tag 11)
+
+This tag can contain either a copy with a 2 or 3 byte offset and fused literals.
+
+The third bit indicates if this is a Copy3 or a Fused Copy2.
+
+| Bits | Meaning        | Description                                  |
+|------|----------------|----------------------------------------------|
+| 0-1  | Tag            | Always 3                                     |
+| 2    | Copy3          | 1 when Copy3, 0 when Fused Copy2.            |
+| 3-4  | Literal Length | Number of literals to emit before the copy.  |
+
+The literal length is shared as bits 3-4, but Fused Copy2 has a minimum of 1.
+
+#### 2.5.1 Fused Copy2
+
+Fused Copy2 offers a short 4->11 byte copy with a 16 bit offset, preceded by 1-4 literals.
+
+| Bits | Meaning                          |
+|------|----------------------------------|
+| 0-1  | Tag. Always 3                    |
+| 2    | Copy3. Always 0                  |
+| 3-4  | Literal length + 1 `[1->4]`      |
+| 5-7  | Copy Length + 4 `[4->11]`        |
+| 8-23 | 16 bit offset + 64 `[64->65599]` |
+| 24-> | 1-4 Literals                     |
+
+For literal + Copy2, a 2-byte offset will follow the tag, then the immediate(s) will follow.
+
+A repeat operation can be used to extend the copy.
+
+An encoded fused Copy2 is 3 bytes, with 1-4 additional literals.
+
+### 2.5.2 Copy3 (Optionally Fused)
+
+Offsets are encoded as 21 bits, with 0 -> 3 fused literals.
+The minimum offset is 65,536 and must be added to the stored offset.
+The maximum backreference offset is therefore 2,162,687 (1<<21 + 65535).
+
+| Bits  | Meaning                            |
+|-------|------------------------------------|
+| 0-1   | Tag. Always 3                      |
+| 2     | Copy3. Always 1.                   |
+| 3-4   | Literal length `[0->3]`            |
+| 5-10  | Copy Length. 6 bits. (See table)   |
+| 11-31 | 21 bit offset + 65536 `[64K->2MB]` |
+| 32->x | 0-3 Extended Length Bytes          |
+| x->   | 0-3 Literals                       |
+
+Length is encoded as follows:
+
+| Value | Output            |
+|-------|-------------------|
+| 0-60  | 4 + Value         |
+| 61    | 64 + Read 1 byte  |
+| 62    | 64 + Read 2 bytes |
+| 63    | 64 + Read 3 bytes |
+
+The minimum encoded Copy3 length is 4 bytes.
+The maximum is 7 bytes, plus up to 3 literals.
+
+Note that the shortest length Copy3 does not gain anything, except possibly avoiding
+a literal tag when there are fused literals, or setting up a repeat for a later copy.
+
+## 3 DICTIONARY FORMAT
+
+TBD.
+
+# STREAM FORMAT
+
+Follows the [Snappy Framing Format](https://github.com/google/snappy/blob/main/framing_format.txt),
+but with modifications that keep it easily backwards compatible with Snappy/S2.
+
+### 1. General structure
+
+The file consists solely of chunks, lying back-to-back with no padding
+in between. Each chunk consists of a single byte of chunk identifier,
+then a three-byte little-endian length of the chunk in bytes (from 0 to
+16,777,215 inclusive), and then the data if any. The four bytes of chunk
+header are not counted in the data length.
+
+The different chunk types are listed below.
The first chunk must always +be the stream identifier chunk (see section 4.1, below). + +### 2. File type identification + +The following identifiers for this format are recommended where appropriate. +However, note that none have been registered officially, so this is only to +be taken as a guideline. + + File extension: .mz + MIME type: application/x-minlz-compressed + HTTP Content-Encoding: x-minlz-compressed + +Individual blocks contain no corruption detection, so these should not be exchanged. +However, if software produces them they should use the `.mzb` extension. + +### 3. Checksum format + +Some chunks have data protected by a checksum (the ones that do will say so +explicitly). The checksums are always masked CRC-32Cs. + +A description of CRC-32C can be found in [RFC 3720](https://datatracker.ietf.org/doc/html/rfc3720), +section 12.1, with examples in section B.4. + +Checksums are not stored directly, but masked, as checksumming data and +then its own checksum can be problematic. The masking is the same as used +in Apache Hadoop: Rotate the checksum by 15 bits, then add the constant +0xa282ead8 (using wraparound as normal for unsigned integers). This is +equivalent to the following C code: + +``` +uint32_t mask_checksum(uint32_t x) { + return ((x >> 15) | (x << 17)) + 0xa282ead8; +} +``` + +Note that the masking is reversible. + +The checksum is always stored as a four-byte long integer, in little-endian. + +### 4. Chunk types + +The currently supported chunk types are described below. The list may +be extended in the future. + +| ID | Description | See Section | +|---------|-------------------------------|-------------| +| 0 | (legacy compressed Data) | 4.3 | +| 1 | Uncompressed Data | 4.3 | +| 2, 3 | MinLZ Compressed Block | 4.4, 4.5 | +| 32 | EOF | 4.6 | +| 4-63 | (reserved, non-skippable) | 4.8 | +| 64 | Stream Index | 4.12 | +| 65-127 | (reserved, skippable) | 4.9 | +| 128-191 | (user defined, skippable) | 4.10 | +| 192-253 | (user defined, non-skippable) | 4.11 | +| 254 | Padding | 4.7 | +| 255 | Stream identifier | 4.1 | + +### 4.1. Stream identifier (chunk type 0xff) + +The stream identifier is always the first element in the stream. +It is exactly six bytes long and starts with "MinLz" in ASCII. +This means that a valid MinLZ framed stream always starts with the bytes: + + (type) (length) M i n L z + 0xff 0x06 0x00 0x00 0x4d 0x69 0x6e 0x4c 0x7a + +The final byte of the identifier is a block size indicator. + +| Bits | Description | +|------|---------------------------| +| 0-3 | Max block size indicator | +| 4-5 | Reserved, must be ignored | +| 6-7 | Reserved, must be 0 | + +#### 4.1.1 Max Block Size + +The value is the log2 of the maximum block size minus 10. + +So if the value is `0x4`, the maximum block size is 2^(4+10) (16KiB). +The maximum block size is 2^23 (8MiB), so the maximum identifier is 13, +which can also be used if for some reason the maximum block size is not known. +This size only applies to content frames. + +Decoders may choose not to decode streams based on the maximum block size. +Decoders *must* reject any value > 13. + +To allow concatenation, a stream identifier can follow an EOF chunk. + +### 4.2. Uncompressed data (chunk type 0x01) + +Uncompressed data chunks allow a compressor to send uncompressed, +raw data; this is useful if, for instance, incompressible or +near-incompressible data is detected, and faster decompression is desired. + +As in the compressed chunks, the data is preceded by its own masked +CRC-32C (see section 3). 
+
+An uncompressed data chunk minus the CRC should not exceed the maximum block size
+as indicated by the Stream identifier.
+
+### 4.3. Legacy Compressed data (chunk type 0x00) — BACKCOMPAT ONLY
+
+Type 0x00 compressed chunks are not allowed in MinLZ streams.
+
+Instead, use type 0x02, which indicates a MinLZ compressed block.
+
+### 4.4. MinLZ Compressed data (chunk type 0x02)
+
+A MinLZ block *without* the MinLZ identifier (initial 0 byte).
+
+A CRC32C (see section 3) checksum of the *uncompressed*
+data is stored at the beginning of the chunk data.
+
+Chunks with 0 decompressed bytes are not allowed,
+nor are blocks whose decompressed size is less than their compressed size.
+
+### 4.5. MinLZ Compressed data - Compressed CRC (chunk type 0x03).
+
+A MinLZ block *without* the MinLZ identifier (initial 0 byte).
+
+A CRC32C (see section 3) checksum of the *compressed*
+data is stored at the beginning of the chunk data.
+
+Chunks with 0 decompressed bytes are not allowed,
+nor are blocks whose decompressed size is less than their compressed size.
+
+If possible, prefer type `0x02` over this.
+
+### 4.6 EOF (chunk type 0x20)
+
+The end of the stream is indicated by a chunk with the ID `0x20`.
+This allows detection of truncated streams.
+
+The output size of the stream is encoded as an unsigned varint as the only content.
+It is allowed to write an empty chunk to skip size validation.
+The maximum chunk size is 10 bytes (a 64 bit value, varint encoded).
+
+Encoders should always emit this chunk.
+Decoders can optionally reject streams that do not have this chunk.
+
+If this is the first chunk in the stream, the stream is empty.
+
+If there are multiple Stream identifier (chunk type 0xff) chunks in the stream,
+each must have a corresponding EOF chunk, and the count resets to 0 at each Stream identifier.
+
+### 4.7. Padding (chunk type 0xfe)
+
+Padding chunks allow a compressor to increase the size of the data stream
+so that it complies with external demands, e.g. that the total number of
+bytes is a multiple of some value.
+
+All bytes of the padding chunk, except the chunk byte itself and the length,
+should be zero, but decompressors must not try to interpret or verify the
+padding data in any way.
+
+### 4.8. Reserved unskippable chunks (chunk types 0x04-0x3f)
+
+These are reserved for future expansion. A decoder that sees such a chunk
+should immediately return an error, as it must assume it cannot decode the
+stream correctly.
+
+Future versions of this specification may define meanings for these chunks.
+
+### 4.9. Reserved skippable chunks (chunk types 0x40-0x7f)
+
+These are also reserved for future expansion, but unlike the chunks
+described in section 4.8, a decoder seeing these must skip them and continue
+decoding.
+
+Future versions of this specification may define meanings for these chunks.
+
+### 4.10. User defined skippable chunks (chunk types 0x80-0xbf)
+
+These are allowed for user-defined data. A decoder that does not recognize
+the chunk type should skip it and continue decoding.
+
+Users should use additional checks to ensure that the chunk is of the expected type.
+
+Future versions of this specification will not define meanings for these chunks.
+
+### 4.11. User defined non-skippable chunks (chunk types 0xc0-0xfd)
+
+These are allowed for user-defined data. If users do not recognize these chunks,
+decoding should stop.
+
+Users should use additional checks to ensure that the chunk is of the expected type.
+
+Future versions of this specification will not define meanings for these chunks.
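+
+As an illustration of the framing described in section 1, here is a minimal sketch in Go
+of writing a user-defined skippable chunk by hand (the helper function is hypothetical):
+
+```Go
+// writeUserChunk frames payload as a user-defined skippable chunk.
+// id must be in the range 0x80-0xbf, and the payload at most
+// 16,777,215 bytes. The 4 header bytes are not counted in the length.
+func writeUserChunk(w io.Writer, id byte, payload []byte) error {
+	if id < 0x80 || id > 0xbf {
+		return errors.New("id outside user-defined skippable range")
+	}
+	if len(payload) > 0xffffff {
+		return errors.New("payload too large for 3-byte length")
+	}
+	hdr := []byte{
+		id,
+		byte(len(payload)), // little-endian 3-byte length follows the id
+		byte(len(payload) >> 8),
+		byte(len(payload) >> 16),
+	}
+	if _, err := w.Write(hdr); err != nil {
+		return err
+	}
+	_, err := w.Write(payload)
+	return err
+}
+```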
+
+
+### 4.12 Index (chunk type 0x40) — OPTIONAL
+
+Each index is structured as a skippable chunk, with the chunk ID 0x40.
+
+Decoders are free to skip this chunk.
+
+The chunk can be read from the front, but contains information
+so it can easily be read from the back as well.
+
+Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding),
+with un-encoded value length of 64 bits, unless other limits are specified.
+
+| Content                              | Format                                                                                                                          |
+|--------------------------------------|---------------------------------------------------------------------------------------------------------------------------------|
+| ID, `[1]byte`                        | Always 0x40.                                                                                                                      |
+| Data Length, `[3]byte`               | 3 byte little-endian length of the chunk in bytes, following this.                                                                |
+| Header `[6]byte`                     | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00".                                                            |
+| UncompressedSize, Varint             | Total Uncompressed size.                                                                                                          |
+| CompressedSize, Varint               | Total Compressed size if known. Should be -1 if unknown.                                                                          |
+| EstBlockSize, Varint                 | Block Size, used for guessing uncompressed offsets. Must be >= 0.                                                                 |
+| Entries, Varint                      | Number of Entries in index, must be < 65536 and >= 0.                                                                             |
+| HasUncompressedOffsets `byte`        | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid.                                                 |
+| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode.                                                                                    |
+| CompressedOffsets, [Entries]VarInt   | Compressed offsets. See below how to decode.                                                                                      |
+| Block Size, `[4]byte`                | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block.          |
+| Trailer `[6]byte`                    | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream.     |
+
+For regular streams the uncompressed offsets are fully predictable,
+so `HasUncompressedOffsets` allows specifying that compressed blocks all have
+exactly `EstBlockSize` bytes of uncompressed content.
+
+Entries *must* be in order, starting with the lowest offset,
+and there *must* be no uncompressed offset duplicates.
+Entries *may* point to the start of a skippable block,
+but it is then not allowed to also have an entry for the next block since
+that would give an uncompressed offset duplicate.
+
+There is no requirement for all blocks to be represented in the index.
+In fact, there is a maximum of 65535 block entries in an index.
+
+The writer can use any method to reduce the number of entries.
+An implicit block start at 0,0 can be assumed.
+
+### Decoding entries:
+
+```
+// Read Uncompressed entries.
+// Each assumes EstBlockSize delta from previous.
+for each entry {
+    uOff = 0
+    if HasUncompressedOffsets == 1 {
+        uOff = ReadVarInt // Read value from stream
+    }
+
+    // Except for the first entry, use previous values.
+    if entryNum == 0 {
+        entry[entryNum].UncompressedOffset = uOff
+        entryNum++
+        continue
+    }
+
+    // Uncompressed uses previous offset and adds EstBlockSize
+    entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff
+    entryNum++
+}
+
+
+// Guess that the first block will be 50% of uncompressed size.
+// Integer truncating division must be used.
+CompressGuess := EstBlockSize / 2
+
+// Read Compressed entries.
+// Each assumes CompressGuess delta from previous.
+// CompressGuess is adjusted for each value.
+for each entry {
+    cOff = ReadVarInt // Read value from stream
+
+    // Except for the first entry, use previous values.
+    if entryNum == 0 {
+        entry[entryNum].CompressedOffset = cOff
+        entryNum++
+        continue
+    }
+
+    // Compressed uses previous and our estimate.
+    entry[entryNum].CompressedOffset = entry[entryNum-1].CompressedOffset + CompressGuess + cOff
+
+    // Adjust compressed offset for next loop, integer truncating division must be used.
+    CompressGuess += cOff/2
+
+    entryNum++
+}
+```
+
+To decode from any given uncompressed offset `(wantOffset)`:
+
+* Iterate entries until `entry[n].UncompressedOffset > wantOffset`.
+* Start decoding from `entry[n-1].CompressedOffset`.
+* Discard `wantOffset - entry[n-1].UncompressedOffset` bytes from the decoded stream.
+
+This is similar to S2, except the ID is 0x40 instead of 0x99.
+(A sketch of this lookup appears after the Encoding Tips below.)
+
+# Implementation Notes
+
+This section contains guidelines for implementation and use.
+None of these are strict requirements.
+
+MinLZ is designed for a certain speed/size tradeoff.
+
+It is designed to be used in scenarios where encoding and decoding speed is critical.
+
+Unless decompression speed is critical, it is not designed for long-term storage,
+and formats like [zstandard](https://facebook.github.io/zstd/) should be considered for better compression.
+Formats like xz/bzip2 also offer excellent compression, but at an even more reduced decompression speed.
+
+## No entropy nor dynamic encoding
+
+MinLZ by design only offers static encoding types and no entropy coding of remainder literals,
+making decoding possible with no tables.
+
+While it was considered, all conditional decoding (decoding based on previous operation or output position)
+was avoided to simplify the decoder.
+
+## Independent block streams
+
+A primary design choice is to make blocks on streams fully independent to facilitate
+independent compression and decompression.
+
+This makes streams seekable, and even without an index, streams can be
+skipped forwards without decompression, and blocks can be decoded concurrently.
+
+The maximum block size of 8MB is designed to minimize the size impact of this.
+
+## Speed Optimizations
+
+This section describes tricks that can help achieve maximum speed.
+
+Decompression has been designed to make use of modern CPU branch prediction.
+
+### Margin-specific code
+
+For decompression, it can be beneficial to have 2 parts of the decompressor:
+a primary decoding loop that runs while there is some input and output margin, and another
+with stricter checks that handles the end of the input/output.
+
+### No-overlap Encodings
+
+Minimum offsets for certain encodings mainly exist to avoid overlapping copies and to make
+a 64 byte/loop copy safe to do.
+
+Copy2/Copy3 operations guarantee there are no copies with an offset less than 64 bytes.
+This means that these can use a bigger copy loop without the need to worry about overlaps.
+
+This effectively moves this branch from the encoder to the decoder.
+
+### Safe Fused Literals
+
+Fused literals can always be safely copied with a 4 byte copy,
+since a match of at least 4 bytes will always follow and "fix up" any extra literals.
+
+Therefore, fused copies are typically faster than a separate literal + copy operation.
+
+### Encoding Tips
+
+ * Prefer fused literals, even when encoded size is equal.
+ * Avoid 1 and 2 byte repeats. The encoding exists mostly for flexibility.
+ * Prefer Copy2 over Copy1 when encoded size is the same.
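+
+As an illustration of the index lookup described in section 4.12, here is a minimal
+sketch in Go (the `indexEntry` type and names are hypothetical; entries are the decoded
+index entries, with an implicit block start at 0,0):
+
+```Go
+// indexEntry mirrors one decoded index entry from section 4.12.
+type indexEntry struct {
+	UncompressedOffset int64
+	CompressedOffset   int64
+}
+
+// seekEntry returns where to start decoding to reach wantOffset, and how
+// many decompressed bytes to discard before reaching it.
+func seekEntry(entries []indexEntry, wantOffset int64) (compressedOffset, discard int64) {
+	best := indexEntry{} // implicit block start at 0,0
+	for _, e := range entries {
+		// Stop at the first entry past the wanted offset; keep the previous one.
+		if e.UncompressedOffset > wantOffset {
+			break
+		}
+		best = e
+	}
+	return best.CompressedOffset, wantOffset - best.UncompressedOffset
+}
+```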
diff --git a/vendor/github.com/minio/minlz/asm_amd64.go b/vendor/github.com/minio/minlz/asm_amd64.go new file mode 100644 index 0000000000..d9416e9e39 --- /dev/null +++ b/vendor/github.com/minio/minlz/asm_amd64.go @@ -0,0 +1,174 @@ +// Code generated by command: go run gen.go -out ../asm_amd64.s -stubs ../asm_amd64.go -pkg=minlz. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !purego + +package minlz + +func _dummy_() + +// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 8388608 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm(dst []byte, src []byte, tmp *[131072]byte) int + +// encodeBlockAsm2MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 2097152 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm2MB(dst []byte, src []byte, tmp *[131072]byte) int + +// encodeBlockAsm512K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 524288 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm512K(dst []byte, src []byte, tmp *[65536]byte) int + +// encodeBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65536 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm64K(dst []byte, src []byte, tmp *[16384]byte) int + +// encodeBlockAsm16K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16384 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm16K(dst []byte, src []byte, tmp *[8192]byte) int + +// encodeBlockAsm4K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4096 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm4K(dst []byte, src []byte, tmp *[2048]byte) int + +// encodeBlockAsm1K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 1024 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm1K(dst []byte, src []byte, tmp *[1024]byte) int + +// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 8388608 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int + +// encodeBetterBlockAsm2MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 2097152 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm2MB(dst []byte, src []byte, tmp *[589824]byte) int + +// encodeBetterBlockAsm512K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 524288 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. 
+// +//go:noescape +func encodeBetterBlockAsm512K(dst []byte, src []byte, tmp *[294912]byte) int + +// encodeBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65536 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm64K(dst []byte, src []byte, tmp *[73728]byte) int + +// encodeBetterBlockAsm16K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16384 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm16K(dst []byte, src []byte, tmp *[36864]byte) int + +// encodeBetterBlockAsm4K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4096 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm4K(dst []byte, src []byte, tmp *[10240]byte) int + +// encodeBetterBlockAsm1K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 1024 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm1K(dst []byte, src []byte, tmp *[4608]byte) int + +// emitLiteral writes a literal chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes with margin of 8 bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +// +//go:noescape +func emitLiteral(dst []byte, lit []byte) int + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +// Length must be at least 4 and < 1<<32 +// +//go:noescape +func emitRepeat(dst []byte, length int) int + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +// +//go:noescape +func emitCopy(dst []byte, offset int, length int) int + +// emitCopyLits2 writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= 65536 +// 4 <= length && length <= MaxBlockSize +// +//go:noescape +func emitCopyLits2(dst []byte, lits []byte, offset int, length int) int + +// emitCopyLits3 writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= (1<<21) +// 4 <= length && length <= MaxBlockSize +// +//go:noescape +func emitCopyLits3(dst []byte, lits []byte, offset int, length int) int + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// +// len(a) <= len(b) +// +//go:noescape +func matchLen(a []byte, b []byte) int + +// cvtLZ4Block converts an LZ4 block to MinLZ +// +//go:noescape +func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) + +// decodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// It assumes that the varint-encoded length of the decompressed bytes has already been read. 
+// +//go:noescape +func decodeBlockAsm(dst []byte, src []byte) int diff --git a/vendor/github.com/minio/minlz/asm_amd64.s b/vendor/github.com/minio/minlz/asm_amd64.s new file mode 100644 index 0000000000..7af14d0dec --- /dev/null +++ b/vendor/github.com/minio/minlz/asm_amd64.s @@ -0,0 +1,20791 @@ +// Code generated by command: go run gen.go -out ../asm_amd64.s -stubs ../asm_amd64.go -pkg=minlz. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !purego + +#include "textflag.h" + +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#ifndef GOAMD64_v3 +#define GOAMD64_v3 +#endif +#endif + RET + +// func encodeBlockAsm(dst []byte, src []byte, tmp *[131072]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBlockAsm(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000400, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm + MOVQ (BX)(DX*1), DI + LEAL -2162685(DX), R8 + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R10 + MOVQ DI, R11 + MOVQ DI, R12 + SHRQ $0x08, R12 + SHLQ $0x10, R11 + IMULQ R10, R11 + SHRQ $0x31, R11 + SHLQ $0x10, R12 + IMULQ R10, R12 + SHRQ $0x31, R12 + MOVL (AX)(R11*4), SI + MOVL (AX)(R12*4), R9 + MOVL DX, (AX)(R11*4) + MOVL DX, (AX)(R12*4) + MOVQ DI, R11 + SHRQ $0x10, R11 + SHLQ $0x10, R11 + IMULQ R10, R11 + SHRQ $0x31, R11 + MOVL DX, R10 + SUBL 16(SP), R10 + MOVL 1(BX)(R10*1), R12 + MOVQ DI, R10 + SHRQ $0x08, R10 + CMPL R10, R12 + JNE no_repeat_found_encodeBlockAsm + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBlockAsm + +repeat_extend_back_loop_encodeBlockAsm: + CMPL DI, SI + JBE repeat_extend_back_end_encodeBlockAsm + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBlockAsm + +repeat_extend_back_end_encodeBlockAsm: + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 4(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeBlockAsm + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeBlockAsm + CMPL R9, $0x00010000 + JB three_bytes_repeat_emit_lits_encodeBlockAsm + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm + +three_bytes_repeat_emit_lits_encodeBlockAsm: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm + +two_bytes_repeat_emit_lits_encodeBlockAsm: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeBlockAsm + JMP memmove_long_repeat_emit_lits_encodeBlockAsm + 
+one_byte_repeat_emit_lits_encodeBlockAsm: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeBlockAsm: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm + +memmove_midrepeat_emit_lits_encodeBlockAsm: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm + +memmove_long_repeat_emit_lits_encodeBlockAsm: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeBlockAsm: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm + 
+matchlen_loopback_16_repeat_extend_encodeBlockAsm: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeBlockAsm: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm + JMP matchlen_match8_repeat_extend_encodeBlockAsm + +matchlen_bsf_16repeat_extend_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_match8_repeat_extend_encodeBlockAsm: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm + +matchlen_bsf_8_repeat_extend_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_match4_repeat_extend_encodeBlockAsm: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm + JB repeat_extend_forward_end_encodeBlockAsm + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm + +matchlen_match1_repeat_extend_encodeBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBlockAsm + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBlockAsm + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBlockAsm + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_match_repeat_encodeBlockAsm: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_match_repeat_encodeBlockAsm: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm + +repeat_one_match_repeat_encodeBlockAsm: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBlockAsm: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm + +no_repeat_found_encodeBlockAsm: + CMPL SI, R8 + JLE offset_ok_0_encodeBlockAsm + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm + +offset_ok_0_encodeBlockAsm: + SHRQ $0x08, DI + MOVL (AX)(R11*4), SI + LEAL 2(DX), R10 + CMPL R9, R8 + JLE offset_ok_1_encodeBlockAsm + CMPL (BX)(R9*1), DI + JEQ candidate2_match_encodeBlockAsm + +offset_ok_1_encodeBlockAsm: + MOVL R10, (AX)(R11*4) + SHRQ $0x08, DI + CMPL SI, R8 + JLE offset_ok_2_encodeBlockAsm + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm + +offset_ok_2_encodeBlockAsm: + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm + +candidate3_match_encodeBlockAsm: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm 
+ +candidate2_match_encodeBlockAsm: + MOVL R10, (AX)(R11*4) + INCL DX + MOVL R9, SI + +candidate_match_encodeBlockAsm: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm + +match_extend_back_loop_encodeBlockAsm: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm + JMP match_extend_back_loop_encodeBlockAsm + +match_extend_back_end_encodeBlockAsm: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm + +matchlen_loopback_16_match_nolit_encodeBlockAsm: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBlockAsm: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBlockAsm + JMP matchlen_match8_match_nolit_encodeBlockAsm + +matchlen_bsf_16match_nolit_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBlockAsm + +matchlen_match8_match_nolit_encodeBlockAsm: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBlockAsm + +matchlen_bsf_8_match_nolit_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBlockAsm + +matchlen_match4_match_nolit_encodeBlockAsm: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBlockAsm + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBlockAsm: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm + JB match_nolit_end_encodeBlockAsm + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBlockAsm + +matchlen_match1_match_nolit_encodeBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeBlockAsm + LEAL 1(R11), R11 + +match_nolit_end_encodeBlockAsm: + ADDL R11, DX + ADDL $0x04, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeBlockAsm + LEAQ (BX)(DI*1), DI + CMPL R8, $0x03 + JA match_emit_lits_copy_encodeBlockAsm + CMPL SI, $0x40 + JB match_emit_lits_copy_encodeBlockAsm + MOVL (DI), DI + CMPL SI, $0x0001003f + JBE match_emit_copy2lits_encodeBlockAsm + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(SI), SI + SHLL $0x0b, SI + LEAL 7(SI)(R8*8), SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_emit_lits_encodeBlockAsm + LEAL -60(R11), R9 + CMPL R11, $0x0000013c + JB emit_copy3_1_match_emit_lits_encodeBlockAsm + CMPL R11, $0x0001003c + JB emit_copy3_2_match_emit_lits_encodeBlockAsm + ADDL $0x000007e0, SI + MOVL SI, (CX) + 
MOVL R9, 4(CX) + ADDQ $0x07, CX + JMP match_emit_copy_litsencodeBlockAsm + +emit_copy3_2_match_emit_lits_encodeBlockAsm: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW R9, 4(CX) + ADDQ $0x06, CX + JMP match_emit_copy_litsencodeBlockAsm + +emit_copy3_1_match_emit_lits_encodeBlockAsm: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB R9, 4(CX) + ADDQ $0x05, CX + JMP match_emit_copy_litsencodeBlockAsm + +emit_copy3_0_match_emit_lits_encodeBlockAsm: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + +match_emit_copy_litsencodeBlockAsm: + MOVL DI, (CX) + ADDQ R8, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +match_emit_copy2lits_encodeBlockAsm: + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, SI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, SI + CMOVLLT R11, SI + LEAL -1(R8)(SI*4), SI + MOVL $0x00000003, R10 + LEAL (R10)(SI*8), SI + MOVB SI, (CX) + ADDQ $0x03, CX + MOVL DI, (CX) + ADDQ R8, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_emit_repeat_copy2_encodeBlockAsm: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_emit_repeat_copy2_encodeBlockAsm: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_one_match_emit_repeat_copy2_encodeBlockAsm: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +match_emit_lits_copy_encodeBlockAsm: + LEAQ 4(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBlockAsm + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm + CMPL R9, $0x00010000 + JB three_bytes_match_emit_encodeBlockAsm + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm + +three_bytes_match_emit_encodeBlockAsm: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm + +two_bytes_match_emit_encodeBlockAsm: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBlockAsm + JMP memmove_long_match_emit_encodeBlockAsm + +one_byte_match_emit_encodeBlockAsm: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, 
(CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm + +memmove_midmatch_emit_encodeBlockAsm: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeBlockAsm + +emit_lit_memmove_mid_match_emit_encodeBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeBlockAsm: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm + +memmove_long_match_emit_encodeBlockAsm: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + +match_nolits_copy_encodeBlockAsm: + // emitCopy + CMPL SI, $0x0001003f + JBE two_byte_offset_match_nolit_encodeBlockAsm + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(SI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_nolit_encodeBlockAsm_emit3 + LEAL -60(R11), DI + CMPL R11, $0x0000013c + JB emit_copy3_1_match_nolit_encodeBlockAsm_emit3 + CMPL R11, $0x0001003c + JB emit_copy3_2_match_nolit_encodeBlockAsm_emit3 + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) + ADDQ $0x07, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy3_2_match_nolit_encodeBlockAsm_emit3: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy3_1_match_nolit_encodeBlockAsm_emit3: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy3_0_match_nolit_encodeBlockAsm_emit3: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_offset_match_nolit_encodeBlockAsm: + CMPL SI, $0x00000400 + JA 
two_byte_match_nolit_encodeBlockAsm + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBlockAsm + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_one_longer_match_nolit_encodeBlockAsm: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBlockAsm + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy1_repeat_match_nolit_encodeBlockAsm: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +two_byte_match_nolit_encodeBlockAsm: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBlockAsm_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBlockAsm_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBlockAsm_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy2_2_match_nolit_encodeBlockAsm_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy2_1_match_nolit_encodeBlockAsm_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +emit_copy2_0_match_nolit_encodeBlockAsm_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, R9 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x31, R8 + SHLQ $0x10, R9 + IMULQ SI, R9 + SHRQ $0x31, R9 + LEAL -2(DX), R10 + MOVL (AX)(R9*4), SI + MOVL R10, (AX)(R8*4) + MOVL DX, (AX)(R9*4) + MOVL DX, R8 + INCL DX + LEAL -2162687(R8), R9 + CMPL SI, R9 + JA match_nolit_len_okencodeBlockAsm + JMP search_loop_encodeBlockAsm + +match_nolit_len_okencodeBlockAsm: + CMPL (BX)(SI*1), DI + JNE search_loop_encodeBlockAsm + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x03, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP 
matchlen_loop_16_entry_match_nolit2_encodeBlockAsm + +matchlen_loopback_16_match_nolit2_encodeBlockAsm: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeBlockAsm: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm + JMP matchlen_match8_match_nolit2_encodeBlockAsm + +matchlen_bsf_16match_nolit2_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeBlockAsm + +matchlen_match8_match_nolit2_encodeBlockAsm: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeBlockAsm + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeBlockAsm + +matchlen_bsf_8_match_nolit2_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R11)(R9*1), R11 + JMP match_nolit2_end_encodeBlockAsm + +matchlen_match4_match_nolit2_encodeBlockAsm: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeBlockAsm + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeBlockAsm + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeBlockAsm: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeBlockAsm + JB match_nolit2_end_encodeBlockAsm + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeBlockAsm + +matchlen_match1_match_nolit2_encodeBlockAsm: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeBlockAsm + LEAL 1(R11), R11 + +match_nolit2_end_encodeBlockAsm: + ADDL R11, DX + ADDL $0x04, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeBlockAsm + +emit_remainder_encodeBlockAsm: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeBlockAsm + LEAQ (BX)(DX*1), DX + LEAQ 4(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeBlockAsm + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm + CMPL BX, $0x00010000 + JB three_bytes_emit_remainder_encodeBlockAsm + MOVL BX, SI + SHRL $0x10, SI + MOVB $0xf8, (CX) + MOVW BX, 1(CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm + +three_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm + +two_bytes_emit_remainder_encodeBlockAsm: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeBlockAsm + JMP memmove_long_emit_remainder_encodeBlockAsm + +one_byte_emit_remainder_encodeBlockAsm: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 + CMPQ AX, $0x08 + JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm + +memmove_midemit_remainder_encodeBlockAsm: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeBlockAsm: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm + +memmove_long_emit_remainder_encodeBlockAsm: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE 
emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeBlockAsm: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm2MB(dst []byte, src []byte, tmp *[131072]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBlockAsm2MB(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000400, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm2MB: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm2MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm2MB: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm2MB + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x31, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x31, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x31, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm2MB + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBlockAsm2MB + +repeat_extend_back_loop_encodeBlockAsm2MB: + CMPL DI, SI + JBE repeat_extend_back_end_encodeBlockAsm2MB + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm2MB + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBlockAsm2MB + +repeat_extend_back_end_encodeBlockAsm2MB: + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 4(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeBlockAsm2MB + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeBlockAsm2MB + CMPL R9, $0x00010000 + JB three_bytes_repeat_emit_lits_encodeBlockAsm2MB + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB + +three_bytes_repeat_emit_lits_encodeBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB + +two_bytes_repeat_emit_lits_encodeBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeBlockAsm2MB + JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB + +one_byte_repeat_emit_lits_encodeBlockAsm2MB: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_8through16 + CMPQ SI, $0x20 + JBE 
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm2MB + +memmove_midrepeat_emit_lits_encodeBlockAsm2MB: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm2MB + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm2MB: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm2MB + +memmove_long_repeat_emit_lits_encodeBlockAsm2MB: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeBlockAsm2MB: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm2MB + +matchlen_loopback_16_repeat_extend_encodeBlockAsm2MB: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB + XORQ 8(SI)(R11*1), R12 + JNZ 
matchlen_bsf_16repeat_extend_encodeBlockAsm2MB + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeBlockAsm2MB: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm2MB + JMP matchlen_match8_repeat_extend_encodeBlockAsm2MB + +matchlen_bsf_16repeat_extend_encodeBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm2MB + +matchlen_match8_repeat_extend_encodeBlockAsm2MB: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm2MB + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm2MB + +matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm2MB + +matchlen_match4_repeat_extend_encodeBlockAsm2MB: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm2MB + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm2MB + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm2MB: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm2MB + JB repeat_extend_forward_end_encodeBlockAsm2MB + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm2MB + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm2MB + +matchlen_match1_repeat_extend_encodeBlockAsm2MB: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm2MB + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm2MB: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBlockAsm2MB + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBlockAsm2MB + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBlockAsm2MB + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm2MB + +repeat_three_match_repeat_encodeBlockAsm2MB: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm2MB + +repeat_two_match_repeat_encodeBlockAsm2MB: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm2MB + +repeat_one_match_repeat_encodeBlockAsm2MB: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBlockAsm2MB: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm2MB + +no_repeat_found_encodeBlockAsm2MB: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm2MB + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm2MB + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm2MB + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm2MB + +candidate3_match_encodeBlockAsm2MB: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm2MB + +candidate2_match_encodeBlockAsm2MB: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm2MB: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm2MB + +match_extend_back_loop_encodeBlockAsm2MB: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm2MB + MOVB 
-1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm2MB + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm2MB + JMP match_extend_back_loop_encodeBlockAsm2MB + +match_extend_back_end_encodeBlockAsm2MB: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm2MB + +matchlen_loopback_16_match_nolit_encodeBlockAsm2MB: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm2MB + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm2MB + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBlockAsm2MB: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBlockAsm2MB + JMP matchlen_match8_match_nolit_encodeBlockAsm2MB + +matchlen_bsf_16match_nolit_encodeBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBlockAsm2MB + +matchlen_match8_match_nolit_encodeBlockAsm2MB: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm2MB + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm2MB + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBlockAsm2MB + +matchlen_bsf_8_match_nolit_encodeBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBlockAsm2MB + +matchlen_match4_match_nolit_encodeBlockAsm2MB: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm2MB + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBlockAsm2MB + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBlockAsm2MB: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm2MB + JB match_nolit_end_encodeBlockAsm2MB + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBlockAsm2MB + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBlockAsm2MB + +matchlen_match1_match_nolit_encodeBlockAsm2MB: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeBlockAsm2MB + LEAL 1(R11), R11 + +match_nolit_end_encodeBlockAsm2MB: + ADDL R11, DX + ADDL $0x04, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeBlockAsm2MB + LEAQ (BX)(DI*1), DI + CMPL R8, $0x03 + JA match_emit_lits_copy_encodeBlockAsm2MB + CMPL SI, $0x40 + JB match_emit_lits_copy_encodeBlockAsm2MB + MOVL (DI), DI + CMPL SI, $0x0001003f + JBE match_emit_copy2lits_encodeBlockAsm2MB + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(SI), SI + SHLL $0x0b, SI + LEAL 7(SI)(R8*8), SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_emit_lits_encodeBlockAsm2MB + LEAL -60(R11), R9 + CMPL R11, $0x0000013c + JB emit_copy3_1_match_emit_lits_encodeBlockAsm2MB + CMPL R11, $0x0001003c + JB emit_copy3_2_match_emit_lits_encodeBlockAsm2MB + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL R9, 4(CX) + ADDQ $0x07, CX + JMP match_emit_copy_litsencodeBlockAsm2MB + +emit_copy3_2_match_emit_lits_encodeBlockAsm2MB: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW R9, 4(CX) + ADDQ 
$0x06, CX + JMP match_emit_copy_litsencodeBlockAsm2MB + +emit_copy3_1_match_emit_lits_encodeBlockAsm2MB: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB R9, 4(CX) + ADDQ $0x05, CX + JMP match_emit_copy_litsencodeBlockAsm2MB + +emit_copy3_0_match_emit_lits_encodeBlockAsm2MB: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + +match_emit_copy_litsencodeBlockAsm2MB: + MOVL DI, (CX) + ADDQ R8, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +match_emit_copy2lits_encodeBlockAsm2MB: + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, SI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, SI + CMOVLLT R11, SI + LEAL -1(R8)(SI*4), SI + MOVL $0x00000003, R10 + LEAL (R10)(SI*8), SI + MOVB SI, (CX) + ADDQ $0x03, CX + MOVL DI, (CX) + ADDQ R8, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm2MB + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm2MB + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm2MB + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm2MB + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +repeat_three_match_emit_repeat_copy2_encodeBlockAsm2MB: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +repeat_two_match_emit_repeat_copy2_encodeBlockAsm2MB: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +repeat_one_match_emit_repeat_copy2_encodeBlockAsm2MB: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +match_emit_lits_copy_encodeBlockAsm2MB: + LEAQ 4(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBlockAsm2MB + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm2MB + CMPL R9, $0x00010000 + JB three_bytes_match_emit_encodeBlockAsm2MB + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm2MB + +three_bytes_match_emit_encodeBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm2MB + +two_bytes_match_emit_encodeBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBlockAsm2MB + JMP memmove_long_match_emit_encodeBlockAsm2MB + +one_byte_match_emit_encodeBlockAsm2MB: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeBlockAsm2MB + +emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP 
memmove_end_copy_match_emit_encodeBlockAsm2MB + +emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm2MB: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm2MB + +memmove_midmatch_emit_encodeBlockAsm2MB: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeBlockAsm2MB + +emit_lit_memmove_mid_match_emit_encodeBlockAsm2MB_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeBlockAsm2MB: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm2MB + +memmove_long_match_emit_encodeBlockAsm2MB: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + +match_nolits_copy_encodeBlockAsm2MB: + // emitCopy + CMPL SI, $0x0001003f + JBE two_byte_offset_match_nolit_encodeBlockAsm2MB + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(SI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_nolit_encodeBlockAsm2MB_emit3 + LEAL -60(R11), DI + CMPL R11, $0x0000013c + JB emit_copy3_1_match_nolit_encodeBlockAsm2MB_emit3 + CMPL R11, $0x0001003c + JB emit_copy3_2_match_nolit_encodeBlockAsm2MB_emit3 + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) + ADDQ $0x07, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +emit_copy3_2_match_nolit_encodeBlockAsm2MB_emit3: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +emit_copy3_1_match_nolit_encodeBlockAsm2MB_emit3: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +emit_copy3_0_match_nolit_encodeBlockAsm2MB_emit3: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + 
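+	// Copy encoding dispatch, as read from the generated branches: offsets
+	// above 0x1003f took the emitCopy3 path above (offset-65536 packed above
+	// an 11-bit shift, emitted in 4 to 7 bytes depending on extra length
+	// bytes). For smaller offsets, two_byte_offset below picks the 2-byte
+	// copy1 form when offset-1 fits in 10 bits (offsets up to 0x400): short
+	// lengths are stored inline, one extra byte extends the range, and longer
+	// matches emit a maximal copy1 followed by a repeat code (the SUBL $0x12
+	// path). Everything else uses the 3-byte copy2 form (offset-64 in 16
+	// bits); copy2 and copy3 share the same scheme of up to three extra
+	// length bytes.
+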
+two_byte_offset_match_nolit_encodeBlockAsm2MB: + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeBlockAsm2MB + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBlockAsm2MB + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +emit_one_longer_match_nolit_encodeBlockAsm2MB: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBlockAsm2MB + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +emit_copy1_repeat_match_nolit_encodeBlockAsm2MB: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm2MB: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +two_byte_match_nolit_encodeBlockAsm2MB: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBlockAsm2MB_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBlockAsm2MB_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBlockAsm2MB_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +emit_copy2_2_match_nolit_encodeBlockAsm2MB_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +emit_copy2_1_match_nolit_encodeBlockAsm2MB_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm2MB + +emit_copy2_0_match_nolit_encodeBlockAsm2MB_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm2MB: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm2MB + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm2MB + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm2MB: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, R9 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x31, R8 + SHLQ $0x10, R9 + IMULQ SI, R9 + SHRQ $0x31, R9 + LEAL -2(DX), R10 + MOVL (AX)(R9*4), SI + MOVL R10, (AX)(R8*4) + MOVL DX, (AX)(R9*4) + MOVL DX, R8 + INCL DX + CMPL (BX)(SI*1), DI + JNE search_loop_encodeBlockAsm2MB + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x03, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 
+ JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm2MB + +matchlen_loopback_16_match_nolit2_encodeBlockAsm2MB: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm2MB + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeBlockAsm2MB: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm2MB + JMP matchlen_match8_match_nolit2_encodeBlockAsm2MB + +matchlen_bsf_16match_nolit2_encodeBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeBlockAsm2MB + +matchlen_match8_match_nolit2_encodeBlockAsm2MB: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeBlockAsm2MB + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeBlockAsm2MB + +matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R11)(R9*1), R11 + JMP match_nolit2_end_encodeBlockAsm2MB + +matchlen_match4_match_nolit2_encodeBlockAsm2MB: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeBlockAsm2MB + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeBlockAsm2MB + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeBlockAsm2MB: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeBlockAsm2MB + JB match_nolit2_end_encodeBlockAsm2MB + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeBlockAsm2MB + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeBlockAsm2MB + +matchlen_match1_match_nolit2_encodeBlockAsm2MB: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeBlockAsm2MB + LEAL 1(R11), R11 + +match_nolit2_end_encodeBlockAsm2MB: + ADDL R11, DX + ADDL $0x04, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeBlockAsm2MB + +emit_remainder_encodeBlockAsm2MB: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeBlockAsm2MB + LEAQ (BX)(DX*1), DX + LEAQ 4(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeBlockAsm2MB + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm2MB + CMPL BX, $0x00010000 + JB three_bytes_emit_remainder_encodeBlockAsm2MB + MOVL BX, SI + SHRL $0x10, SI + MOVB $0xf8, (CX) + MOVW BX, 1(CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm2MB + +three_bytes_emit_remainder_encodeBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm2MB + +two_bytes_emit_remainder_encodeBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeBlockAsm2MB + JMP memmove_long_emit_remainder_encodeBlockAsm2MB + +one_byte_emit_remainder_encodeBlockAsm2MB: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_1or2 + JE 
emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm2MB: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm2MB + +memmove_midemit_remainder_encodeBlockAsm2MB: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm2MB + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm2MB_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeBlockAsm2MB: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm2MB + +memmove_long_emit_remainder_encodeBlockAsm2MB: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_big_loop_back + 
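+	// genMemMoveLong, as generated: X0-X3 were loaded from the first and last
+	// 32 bytes of the source up front. The ANDL $0x1f / SUBQ sequence rounds
+	// the destination pointer up to 32-byte alignment, so the MOVOA stores in
+	// big_loop_back (above) and forward_sse_loop_32 (below) are aligned while
+	// the MOVOU loads stay unaligned; the trailing MOVOU stores of X0-X3 then
+	// patch the unaligned head and tail of the destination.
+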
+emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeBlockAsm2MB: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm512K(dst []byte, src []byte, tmp *[65536]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBlockAsm512K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000200, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm512K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm512K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm512K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm512K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x32, R11 + MOVL (AX)(R10*4), SI + MOVL (AX)(R11*4), R8 + MOVL DX, (AX)(R10*4) + MOVL DX, (AX)(R11*4) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm512K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBlockAsm512K + +repeat_extend_back_loop_encodeBlockAsm512K: + CMPL DI, SI + JBE repeat_extend_back_end_encodeBlockAsm512K + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm512K + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBlockAsm512K + +repeat_extend_back_end_encodeBlockAsm512K: + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 4(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeBlockAsm512K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeBlockAsm512K + CMPL R9, $0x00010000 + JB three_bytes_repeat_emit_lits_encodeBlockAsm512K + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K + +three_bytes_repeat_emit_lits_encodeBlockAsm512K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K + +two_bytes_repeat_emit_lits_encodeBlockAsm512K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeBlockAsm512K + JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K + +one_byte_repeat_emit_lits_encodeBlockAsm512K: + SHLB $0x03, R9 + MOVB R9, 
(CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm512K + +memmove_midrepeat_emit_lits_encodeBlockAsm512K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm512K + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm512K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm512K + +memmove_long_repeat_emit_lits_encodeBlockAsm512K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeBlockAsm512K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm512K + 
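+	// The matchLen loop below XORs 16 bytes per iteration; on the first
+	// nonzero difference, TZCNT (under GOAMD64_v3) or BSF finds the lowest
+	// mismatching bit and SARQ $0x03 converts that bit index into a byte
+	// count. A rough Go equivalent of the idea (hypothetical sketch, not part
+	// of this package; assumes len(a) <= len(b) and uses encoding/binary and
+	// math/bits):
+	//
+	//	func matchLen(a, b []byte) (n int) {
+	//		for len(a)-n >= 8 {
+	//			x := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
+	//			if x != 0 {
+	//				// first differing bit / 8 = number of equal leading bytes
+	//				return n + bits.TrailingZeros64(x)>>3
+	//			}
+	//			n += 8
+	//		}
+	//		for n < len(a) && a[n] == b[n] {
+	//			n++
+	//		}
+	//		return n
+	//	}
+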
+matchlen_loopback_16_repeat_extend_encodeBlockAsm512K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm512K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm512K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeBlockAsm512K: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm512K + JMP matchlen_match8_repeat_extend_encodeBlockAsm512K + +matchlen_bsf_16repeat_extend_encodeBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm512K + +matchlen_match8_repeat_extend_encodeBlockAsm512K: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm512K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm512K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm512K + +matchlen_bsf_8_repeat_extend_encodeBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm512K + +matchlen_match4_repeat_extend_encodeBlockAsm512K: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm512K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm512K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm512K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm512K + JB repeat_extend_forward_end_encodeBlockAsm512K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm512K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm512K + +matchlen_match1_repeat_extend_encodeBlockAsm512K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm512K + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm512K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBlockAsm512K + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBlockAsm512K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBlockAsm512K + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm512K + +repeat_three_match_repeat_encodeBlockAsm512K: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm512K + +repeat_two_match_repeat_encodeBlockAsm512K: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm512K + +repeat_one_match_repeat_encodeBlockAsm512K: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBlockAsm512K: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm512K + +no_repeat_found_encodeBlockAsm512K: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm512K + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm512K + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm512K + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm512K + +candidate3_match_encodeBlockAsm512K: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm512K + +candidate2_match_encodeBlockAsm512K: + MOVL R9, (AX)(R10*4) 
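+	// The second candidate was hashed from the bytes one position ahead,
+	// so step the current position forward before extending the match.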
+ INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm512K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm512K + +match_extend_back_loop_encodeBlockAsm512K: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm512K + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm512K + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm512K + JMP match_extend_back_loop_encodeBlockAsm512K + +match_extend_back_end_encodeBlockAsm512K: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm512K + +matchlen_loopback_16_match_nolit_encodeBlockAsm512K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm512K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm512K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBlockAsm512K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBlockAsm512K + JMP matchlen_match8_match_nolit_encodeBlockAsm512K + +matchlen_bsf_16match_nolit_encodeBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBlockAsm512K + +matchlen_match8_match_nolit_encodeBlockAsm512K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm512K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm512K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBlockAsm512K + +matchlen_bsf_8_match_nolit_encodeBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBlockAsm512K + +matchlen_match4_match_nolit_encodeBlockAsm512K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm512K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBlockAsm512K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBlockAsm512K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm512K + JB match_nolit_end_encodeBlockAsm512K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBlockAsm512K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBlockAsm512K + +matchlen_match1_match_nolit_encodeBlockAsm512K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeBlockAsm512K + LEAL 1(R11), R11 + +match_nolit_end_encodeBlockAsm512K: + ADDL R11, DX + ADDL $0x04, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeBlockAsm512K + LEAQ (BX)(DI*1), DI + CMPL R8, $0x03 + JA match_emit_lits_copy_encodeBlockAsm512K + CMPL SI, $0x40 + JB match_emit_lits_copy_encodeBlockAsm512K + MOVL (DI), DI + CMPL SI, $0x0001003f + JBE match_emit_copy2lits_encodeBlockAsm512K + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(SI), SI + SHLL $0x0b, SI + LEAL 7(SI)(R8*8), SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_emit_lits_encodeBlockAsm512K + LEAL -60(R11), R9 + CMPL R11, $0x0000013c + JB emit_copy3_1_match_emit_lits_encodeBlockAsm512K + CMPL R11, 
$0x0001003c + JB emit_copy3_2_match_emit_lits_encodeBlockAsm512K + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL R9, 4(CX) + ADDQ $0x07, CX + JMP match_emit_copy_litsencodeBlockAsm512K + +emit_copy3_2_match_emit_lits_encodeBlockAsm512K: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW R9, 4(CX) + ADDQ $0x06, CX + JMP match_emit_copy_litsencodeBlockAsm512K + +emit_copy3_1_match_emit_lits_encodeBlockAsm512K: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB R9, 4(CX) + ADDQ $0x05, CX + JMP match_emit_copy_litsencodeBlockAsm512K + +emit_copy3_0_match_emit_lits_encodeBlockAsm512K: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + +match_emit_copy_litsencodeBlockAsm512K: + MOVL DI, (CX) + ADDQ R8, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +match_emit_copy2lits_encodeBlockAsm512K: + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, SI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, SI + CMOVLLT R11, SI + LEAL -1(R8)(SI*4), SI + MOVL $0x00000003, R10 + LEAL (R10)(SI*8), SI + MOVB SI, (CX) + ADDQ $0x03, CX + MOVL DI, (CX) + ADDQ R8, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm512K + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm512K + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm512K + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm512K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +repeat_three_match_emit_repeat_copy2_encodeBlockAsm512K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +repeat_two_match_emit_repeat_copy2_encodeBlockAsm512K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +repeat_one_match_emit_repeat_copy2_encodeBlockAsm512K: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +match_emit_lits_copy_encodeBlockAsm512K: + LEAQ 4(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBlockAsm512K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm512K + CMPL R9, $0x00010000 + JB three_bytes_match_emit_encodeBlockAsm512K + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm512K + +three_bytes_match_emit_encodeBlockAsm512K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm512K + +two_bytes_match_emit_encodeBlockAsm512K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBlockAsm512K + JMP memmove_long_match_emit_encodeBlockAsm512K + +one_byte_match_emit_encodeBlockAsm512K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_33through64 + 
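+	// The buckets below cover each size range with a fixed store pattern:
+	// a single 16-byte MOVOU (relying on the 16-byte margin noted above),
+	// or overlapping head/tail pairs for the 17-32 and 33-64 byte ranges.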
+emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeBlockAsm512K + +emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm512K + +emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm512K: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm512K + +memmove_midmatch_emit_encodeBlockAsm512K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeBlockAsm512K + +emit_lit_memmove_mid_match_emit_encodeBlockAsm512K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeBlockAsm512K: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm512K + +memmove_long_match_emit_encodeBlockAsm512K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + +match_nolits_copy_encodeBlockAsm512K: + // emitCopy + CMPL SI, $0x0001003f + JBE two_byte_offset_match_nolit_encodeBlockAsm512K + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(SI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_nolit_encodeBlockAsm512K_emit3 + LEAL -60(R11), DI + CMPL R11, $0x0000013c + JB emit_copy3_1_match_nolit_encodeBlockAsm512K_emit3 + CMPL R11, $0x0001003c + JB emit_copy3_2_match_nolit_encodeBlockAsm512K_emit3 + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) + ADDQ $0x07, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +emit_copy3_2_match_nolit_encodeBlockAsm512K_emit3: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + 
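+	// Length-4 values up to 60 are packed into bits 5-10 of the four-byte
+	// tag (emit_copy3_0); field values 61, 62 and 63 instead mark a one-,
+	// two- or three-byte extension holding length-64 (the dword store in
+	// the widest case spills one scratch byte into the output margin).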
+emit_copy3_1_match_nolit_encodeBlockAsm512K_emit3: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +emit_copy3_0_match_nolit_encodeBlockAsm512K_emit3: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +two_byte_offset_match_nolit_encodeBlockAsm512K: + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeBlockAsm512K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBlockAsm512K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +emit_one_longer_match_nolit_encodeBlockAsm512K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBlockAsm512K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +emit_copy1_repeat_match_nolit_encodeBlockAsm512K: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm512K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +two_byte_match_nolit_encodeBlockAsm512K: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBlockAsm512K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBlockAsm512K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBlockAsm512K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +emit_copy2_2_match_nolit_encodeBlockAsm512K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +emit_copy2_1_match_nolit_encodeBlockAsm512K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm512K + +emit_copy2_0_match_nolit_encodeBlockAsm512K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm512K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm512K + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm512K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm512K: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, R9 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x32, R8 + SHLQ $0x10, R9 + IMULQ SI, R9 + SHRQ $0x32, R9 + LEAL -2(DX), R10 + MOVL (AX)(R9*4), SI + MOVL R10, (AX)(R8*4) + MOVL DX, (AX)(R9*4) + 
MOVL DX, R8 + INCL DX + CMPL (BX)(SI*1), DI + JNE search_loop_encodeBlockAsm512K + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x03, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm512K + +matchlen_loopback_16_match_nolit2_encodeBlockAsm512K: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm512K + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm512K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeBlockAsm512K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm512K + JMP matchlen_match8_match_nolit2_encodeBlockAsm512K + +matchlen_bsf_16match_nolit2_encodeBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeBlockAsm512K + +matchlen_match8_match_nolit2_encodeBlockAsm512K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeBlockAsm512K + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm512K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeBlockAsm512K + +matchlen_bsf_8_match_nolit2_encodeBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R11)(R9*1), R11 + JMP match_nolit2_end_encodeBlockAsm512K + +matchlen_match4_match_nolit2_encodeBlockAsm512K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeBlockAsm512K + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeBlockAsm512K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeBlockAsm512K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeBlockAsm512K + JB match_nolit2_end_encodeBlockAsm512K + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeBlockAsm512K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeBlockAsm512K + +matchlen_match1_match_nolit2_encodeBlockAsm512K: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeBlockAsm512K + LEAL 1(R11), R11 + +match_nolit2_end_encodeBlockAsm512K: + ADDL R11, DX + ADDL $0x04, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeBlockAsm512K + +emit_remainder_encodeBlockAsm512K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeBlockAsm512K + LEAQ (BX)(DX*1), DX + LEAQ 4(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeBlockAsm512K + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm512K + CMPL BX, $0x00010000 + JB three_bytes_emit_remainder_encodeBlockAsm512K + MOVL BX, SI + SHRL $0x10, SI + MOVB $0xf8, (CX) + MOVW BX, 1(CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm512K + +three_bytes_emit_remainder_encodeBlockAsm512K: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm512K + +two_bytes_emit_remainder_encodeBlockAsm512K: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX 
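+	// BX is rebiased to the run length minus one after the size byte is
+	// stored; runs below 0x40 take the mid-sized copy, longer ones the
+	// 32-byte aligned loop.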
+ ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeBlockAsm512K + JMP memmove_long_emit_remainder_encodeBlockAsm512K + +one_byte_emit_remainder_encodeBlockAsm512K: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm512K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm512K + +memmove_midemit_remainder_encodeBlockAsm512K: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm512K + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm512K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeBlockAsm512K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm512K + +memmove_long_emit_remainder_encodeBlockAsm512K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA 
emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeBlockAsm512K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm64K(dst []byte, src []byte, tmp *[16384]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBlockAsm64K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000080, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm64K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm64K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm64K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x33, R10 + SHLQ $0x10, R11 + IMULQ R9, R11 + SHRQ $0x33, R11 + MOVWLZX (AX)(R10*2), SI + MOVWLZX (AX)(R11*2), R8 + MOVW DX, (AX)(R10*2) + MOVW DX, (AX)(R11*2) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x33, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm64K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBlockAsm64K + +repeat_extend_back_loop_encodeBlockAsm64K: + CMPL DI, SI + JBE repeat_extend_back_end_encodeBlockAsm64K + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm64K + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBlockAsm64K + +repeat_extend_back_end_encodeBlockAsm64K: + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 4(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeBlockAsm64K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeBlockAsm64K + JB three_bytes_repeat_emit_lits_encodeBlockAsm64K + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K + +three_bytes_repeat_emit_lits_encodeBlockAsm64K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + 
ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K + +two_bytes_repeat_emit_lits_encodeBlockAsm64K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeBlockAsm64K + JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K + +one_byte_repeat_emit_lits_encodeBlockAsm64K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm64K + +memmove_midrepeat_emit_lits_encodeBlockAsm64K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm64K + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm64K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm64K + +memmove_long_repeat_emit_lits_encodeBlockAsm64K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU 
X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeBlockAsm64K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm64K + +matchlen_loopback_16_repeat_extend_encodeBlockAsm64K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm64K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm64K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeBlockAsm64K: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm64K + JMP matchlen_match8_repeat_extend_encodeBlockAsm64K + +matchlen_bsf_16repeat_extend_encodeBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm64K + +matchlen_match8_repeat_extend_encodeBlockAsm64K: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm64K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm64K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm64K + +matchlen_bsf_8_repeat_extend_encodeBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm64K + +matchlen_match4_repeat_extend_encodeBlockAsm64K: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm64K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm64K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm64K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm64K + JB repeat_extend_forward_end_encodeBlockAsm64K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm64K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm64K + +matchlen_match1_repeat_extend_encodeBlockAsm64K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm64K + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm64K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBlockAsm64K + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBlockAsm64K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBlockAsm64K + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm64K + +repeat_three_match_repeat_encodeBlockAsm64K: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm64K + +repeat_two_match_repeat_encodeBlockAsm64K: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm64K + +repeat_one_match_repeat_encodeBlockAsm64K: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBlockAsm64K: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm64K + +no_repeat_found_encodeBlockAsm64K: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm64K + SHRQ $0x08, DI + MOVWLZX (AX)(R10*2), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm64K + MOVW 
R9, (AX)(R10*2) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm64K + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm64K + +candidate3_match_encodeBlockAsm64K: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm64K + +candidate2_match_encodeBlockAsm64K: + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm64K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm64K + +match_extend_back_loop_encodeBlockAsm64K: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm64K + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm64K + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm64K + JMP match_extend_back_loop_encodeBlockAsm64K + +match_extend_back_end_encodeBlockAsm64K: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm64K + +matchlen_loopback_16_match_nolit_encodeBlockAsm64K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm64K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm64K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBlockAsm64K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBlockAsm64K + JMP matchlen_match8_match_nolit_encodeBlockAsm64K + +matchlen_bsf_16match_nolit_encodeBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBlockAsm64K + +matchlen_match8_match_nolit_encodeBlockAsm64K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm64K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm64K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBlockAsm64K + +matchlen_bsf_8_match_nolit_encodeBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBlockAsm64K + +matchlen_match4_match_nolit_encodeBlockAsm64K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm64K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBlockAsm64K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBlockAsm64K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm64K + JB match_nolit_end_encodeBlockAsm64K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBlockAsm64K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBlockAsm64K + +matchlen_match1_match_nolit_encodeBlockAsm64K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeBlockAsm64K + LEAL 1(R11), R11 + +match_nolit_end_encodeBlockAsm64K: + ADDL R11, DX + ADDL $0x04, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeBlockAsm64K + LEAQ (BX)(DI*1), DI + CMPL R8, $0x03 + JA match_emit_lits_copy_encodeBlockAsm64K + CMPL SI, $0x40 + JB match_emit_lits_copy_encodeBlockAsm64K + MOVL (DI), DI + + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, SI + LEAL -11(R11), R10 + 
LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, SI + CMOVLLT R11, SI + LEAL -1(R8)(SI*4), SI + MOVL $0x00000003, R10 + LEAL (R10)(SI*8), SI + MOVB SI, (CX) + ADDQ $0x03, CX + MOVL DI, (CX) + ADDQ R8, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm64K + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm64K + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm64K + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm64K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +repeat_three_match_emit_repeat_copy2_encodeBlockAsm64K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +repeat_two_match_emit_repeat_copy2_encodeBlockAsm64K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +repeat_one_match_emit_repeat_copy2_encodeBlockAsm64K: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +match_emit_lits_copy_encodeBlockAsm64K: + LEAQ 4(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBlockAsm64K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm64K + JB three_bytes_match_emit_encodeBlockAsm64K + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm64K + +three_bytes_match_emit_encodeBlockAsm64K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm64K + +two_bytes_match_emit_encodeBlockAsm64K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBlockAsm64K + JMP memmove_long_match_emit_encodeBlockAsm64K + +one_byte_match_emit_encodeBlockAsm64K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeBlockAsm64K + +emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm64K + +emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm64K: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm64K + +memmove_midmatch_emit_encodeBlockAsm64K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_17through32 + JMP 
emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeBlockAsm64K + +emit_lit_memmove_mid_match_emit_encodeBlockAsm64K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeBlockAsm64K: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm64K + +memmove_long_match_emit_encodeBlockAsm64K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + +match_nolits_copy_encodeBlockAsm64K: + // emitCopy + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeBlockAsm64K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBlockAsm64K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +emit_one_longer_match_nolit_encodeBlockAsm64K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBlockAsm64K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +emit_copy1_repeat_match_nolit_encodeBlockAsm64K: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm64K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + 
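+	// Offsets above 0x400 use the copy2 form below: offset-64 in a 16-bit
+	// field, with the same inline-up-to-60 length scheme as copy3 and one-,
+	// two- or three-byte extensions behind the 0xf6/0xfa/0xfe markers.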
+two_byte_match_nolit_encodeBlockAsm64K: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBlockAsm64K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBlockAsm64K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBlockAsm64K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +emit_copy2_2_match_nolit_encodeBlockAsm64K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +emit_copy2_1_match_nolit_encodeBlockAsm64K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm64K + +emit_copy2_0_match_nolit_encodeBlockAsm64K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm64K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm64K + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm64K: + MOVQ $0x0000cf1bbcdcbf9b, SI + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, R9 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x33, R8 + SHLQ $0x10, R9 + IMULQ SI, R9 + SHRQ $0x33, R9 + LEAL -2(DX), R10 + MOVWLZX (AX)(R9*2), SI + MOVW R10, (AX)(R8*2) + MOVW DX, (AX)(R9*2) + MOVL DX, R8 + INCL DX + CMPL (BX)(SI*1), DI + JNE search_loop_encodeBlockAsm64K + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x03, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm64K + +matchlen_loopback_16_match_nolit2_encodeBlockAsm64K: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm64K + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm64K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeBlockAsm64K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm64K + JMP matchlen_match8_match_nolit2_encodeBlockAsm64K + +matchlen_bsf_16match_nolit2_encodeBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeBlockAsm64K + +matchlen_match8_match_nolit2_encodeBlockAsm64K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeBlockAsm64K + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm64K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeBlockAsm64K + +matchlen_bsf_8_match_nolit2_encodeBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R11)(R9*1), R11 + JMP match_nolit2_end_encodeBlockAsm64K + +matchlen_match4_match_nolit2_encodeBlockAsm64K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeBlockAsm64K + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeBlockAsm64K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeBlockAsm64K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeBlockAsm64K + JB match_nolit2_end_encodeBlockAsm64K + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE 
matchlen_match1_match_nolit2_encodeBlockAsm64K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeBlockAsm64K + +matchlen_match1_match_nolit2_encodeBlockAsm64K: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeBlockAsm64K + LEAL 1(R11), R11 + +match_nolit2_end_encodeBlockAsm64K: + ADDL R11, DX + ADDL $0x04, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeBlockAsm64K + +emit_remainder_encodeBlockAsm64K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeBlockAsm64K + LEAQ (BX)(DX*1), DX + LEAQ 4(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeBlockAsm64K + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm64K + JB three_bytes_emit_remainder_encodeBlockAsm64K + MOVL BX, SI + SHRL $0x10, SI + MOVB $0xf8, (CX) + MOVW BX, 1(CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm64K + +three_bytes_emit_remainder_encodeBlockAsm64K: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm64K + +two_bytes_emit_remainder_encodeBlockAsm64K: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeBlockAsm64K + JMP memmove_long_emit_remainder_encodeBlockAsm64K + +one_byte_emit_remainder_encodeBlockAsm64K: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + 
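+	// All short-move buckets converge here; BX was precomputed as the
+	// destination end, so a single MOVQ advances the output pointer past
+	// the copied literals.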
+memmove_end_copy_emit_remainder_encodeBlockAsm64K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm64K + +memmove_midemit_remainder_encodeBlockAsm64K: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm64K + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm64K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeBlockAsm64K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm64K + +memmove_long_emit_remainder_encodeBlockAsm64K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeBlockAsm64K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm16K(dst []byte, src []byte, tmp *[8192]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBlockAsm16K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000040, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm16K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm16K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm16K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm16K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x18, R11 + IMULQ R9, R11 + SHRQ $0x34, R11 + MOVWLZX (AX)(R10*2), SI + MOVWLZX (AX)(R11*2), R8 + MOVW DX, (AX)(R10*2) + MOVW DX, (AX)(R11*2) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x18, R10 + IMULQ R9, R10 + SHRQ $0x34, 
R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm16K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBlockAsm16K + +repeat_extend_back_loop_encodeBlockAsm16K: + CMPL DI, SI + JBE repeat_extend_back_end_encodeBlockAsm16K + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm16K + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBlockAsm16K + +repeat_extend_back_end_encodeBlockAsm16K: + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 3(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeBlockAsm16K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeBlockAsm16K + JB three_bytes_repeat_emit_lits_encodeBlockAsm16K + +three_bytes_repeat_emit_lits_encodeBlockAsm16K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm16K + +two_bytes_repeat_emit_lits_encodeBlockAsm16K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeBlockAsm16K + JMP memmove_long_repeat_emit_lits_encodeBlockAsm16K + +one_byte_repeat_emit_lits_encodeBlockAsm16K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm16K + +memmove_midrepeat_emit_lits_encodeBlockAsm16K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm16K + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm16K: + MOVQ 
R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm16K + +memmove_long_repeat_emit_lits_encodeBlockAsm16K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeBlockAsm16K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm16K + +matchlen_loopback_16_repeat_extend_encodeBlockAsm16K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm16K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm16K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeBlockAsm16K: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm16K + JMP matchlen_match8_repeat_extend_encodeBlockAsm16K + +matchlen_bsf_16repeat_extend_encodeBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm16K + +matchlen_match8_repeat_extend_encodeBlockAsm16K: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm16K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm16K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm16K + +matchlen_bsf_8_repeat_extend_encodeBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm16K + +matchlen_match4_repeat_extend_encodeBlockAsm16K: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm16K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm16K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm16K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm16K + JB repeat_extend_forward_end_encodeBlockAsm16K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm16K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm16K + +matchlen_match1_repeat_extend_encodeBlockAsm16K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm16K + LEAL 1(R11), R11 + 
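The matchLen fragments above count matching bytes sixteen at a time: two 8-byte loads are XOR-ed against the candidate position, the first nonzero result is fed to TZCNTQ (BSFQ when GOAMD64_v3 is not defined), and shifting the bit index right by three turns it into the byte offset of the first mismatch; 8-, 4-, 2- and 1-byte tails handle what remains. A minimal Go sketch of the core 8-byte step (illustrative only — the name matchLen and the slice signature are assumptions, not part of this generated file; it assumes len(a) <= len(b)):

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// matchLen mirrors the assembly's strategy: compare 8 bytes at a time by
// XOR-ing the two words; a nonzero XOR pinpoints the first mismatching bit,
// and TrailingZeros64/8 converts that into whole matching bytes.
// (The generated code unrolls this to 16 bytes per loop iteration.)
func matchLen(a, b []byte) (n int) {
	for len(a) >= 8 {
		if x := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); x != 0 {
			return n + bits.TrailingZeros64(x)>>3
		}
		a, b = a[8:], b[8:]
		n += 8
	}
	for i := range a { // tail: byte-by-byte
		if a[i] != b[i] {
			break
		}
		n++
	}
	return n
}

func main() {
	fmt.Println(matchLen([]byte("abcdefgh123"), []byte("abcdefgh124"))) // 10
}

The XOR trick avoids a per-byte loop: equality over a whole word is a single compare, and the mismatch position falls out of the trailing-zero count.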
+repeat_extend_forward_end_encodeBlockAsm16K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBlockAsm16K + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBlockAsm16K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBlockAsm16K + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm16K + +repeat_three_match_repeat_encodeBlockAsm16K: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm16K + +repeat_two_match_repeat_encodeBlockAsm16K: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm16K + +repeat_one_match_repeat_encodeBlockAsm16K: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBlockAsm16K: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm16K + +no_repeat_found_encodeBlockAsm16K: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm16K + SHRQ $0x08, DI + MOVWLZX (AX)(R10*2), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm16K + MOVW R9, (AX)(R10*2) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm16K + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm16K + +candidate3_match_encodeBlockAsm16K: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm16K + +candidate2_match_encodeBlockAsm16K: + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm16K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm16K + +match_extend_back_loop_encodeBlockAsm16K: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm16K + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm16K + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm16K + JMP match_extend_back_loop_encodeBlockAsm16K + +match_extend_back_end_encodeBlockAsm16K: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm16K + +matchlen_loopback_16_match_nolit_encodeBlockAsm16K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm16K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm16K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBlockAsm16K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBlockAsm16K + JMP matchlen_match8_match_nolit_encodeBlockAsm16K + +matchlen_bsf_16match_nolit_encodeBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBlockAsm16K + +matchlen_match8_match_nolit_encodeBlockAsm16K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm16K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm16K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBlockAsm16K + +matchlen_bsf_8_match_nolit_encodeBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP 
match_nolit_end_encodeBlockAsm16K + +matchlen_match4_match_nolit_encodeBlockAsm16K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm16K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBlockAsm16K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBlockAsm16K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm16K + JB match_nolit_end_encodeBlockAsm16K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBlockAsm16K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBlockAsm16K + +matchlen_match1_match_nolit_encodeBlockAsm16K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeBlockAsm16K + LEAL 1(R11), R11 + +match_nolit_end_encodeBlockAsm16K: + ADDL R11, DX + ADDL $0x04, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeBlockAsm16K + LEAQ (BX)(DI*1), DI + CMPL R8, $0x03 + JA match_emit_lits_copy_encodeBlockAsm16K + CMPL SI, $0x40 + JB match_emit_lits_copy_encodeBlockAsm16K + MOVL (DI), DI + + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, SI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, SI + CMOVLLT R11, SI + LEAL -1(R8)(SI*4), SI + MOVL $0x00000003, R10 + LEAL (R10)(SI*8), SI + MOVB SI, (CX) + ADDQ $0x03, CX + MOVL DI, (CX) + ADDQ R8, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm16K + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm16K + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm16K + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm16K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +repeat_three_match_emit_repeat_copy2_encodeBlockAsm16K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +repeat_two_match_emit_repeat_copy2_encodeBlockAsm16K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +repeat_one_match_emit_repeat_copy2_encodeBlockAsm16K: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +match_emit_lits_copy_encodeBlockAsm16K: + LEAQ 3(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBlockAsm16K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm16K + JB three_bytes_match_emit_encodeBlockAsm16K + +three_bytes_match_emit_encodeBlockAsm16K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm16K + +two_bytes_match_emit_encodeBlockAsm16K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBlockAsm16K + JMP memmove_long_match_emit_encodeBlockAsm16K + +one_byte_match_emit_encodeBlockAsm16K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_17through32 + JMP 
emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeBlockAsm16K + +emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm16K + +emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm16K: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm16K + +memmove_midmatch_emit_encodeBlockAsm16K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeBlockAsm16K + +emit_lit_memmove_mid_match_emit_encodeBlockAsm16K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeBlockAsm16K: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm16K + +memmove_long_match_emit_encodeBlockAsm16K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + +match_nolits_copy_encodeBlockAsm16K: + // emitCopy + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeBlockAsm16K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBlockAsm16K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +emit_one_longer_match_nolit_encodeBlockAsm16K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBlockAsm16K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +emit_copy1_repeat_match_nolit_encodeBlockAsm16K: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 
57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm16K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +two_byte_match_nolit_encodeBlockAsm16K: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBlockAsm16K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBlockAsm16K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBlockAsm16K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +emit_copy2_2_match_nolit_encodeBlockAsm16K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +emit_copy2_1_match_nolit_encodeBlockAsm16K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm16K + +emit_copy2_0_match_nolit_encodeBlockAsm16K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm16K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm16K + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm16K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm16K: + MOVQ $0x000000cf1bbcdcbb, SI + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, R9 + SHLQ $0x18, R8 + IMULQ SI, R8 + SHRQ $0x34, R8 + SHLQ $0x18, R9 + IMULQ SI, R9 + SHRQ $0x34, R9 + LEAL -2(DX), R10 + MOVWLZX (AX)(R9*2), SI + MOVW R10, (AX)(R8*2) + MOVW DX, (AX)(R9*2) + MOVL DX, R8 + INCL DX + CMPL (BX)(SI*1), DI + JNE search_loop_encodeBlockAsm16K + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x03, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm16K + +matchlen_loopback_16_match_nolit2_encodeBlockAsm16K: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm16K + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm16K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeBlockAsm16K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm16K + JMP matchlen_match8_match_nolit2_encodeBlockAsm16K + +matchlen_bsf_16match_nolit2_encodeBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP 
match_nolit2_end_encodeBlockAsm16K + +matchlen_match8_match_nolit2_encodeBlockAsm16K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeBlockAsm16K + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm16K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeBlockAsm16K + +matchlen_bsf_8_match_nolit2_encodeBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R11)(R9*1), R11 + JMP match_nolit2_end_encodeBlockAsm16K + +matchlen_match4_match_nolit2_encodeBlockAsm16K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeBlockAsm16K + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeBlockAsm16K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeBlockAsm16K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeBlockAsm16K + JB match_nolit2_end_encodeBlockAsm16K + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeBlockAsm16K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeBlockAsm16K + +matchlen_match1_match_nolit2_encodeBlockAsm16K: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeBlockAsm16K + LEAL 1(R11), R11 + +match_nolit2_end_encodeBlockAsm16K: + ADDL R11, DX + ADDL $0x04, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeBlockAsm16K + +emit_remainder_encodeBlockAsm16K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeBlockAsm16K + LEAQ (BX)(DX*1), DX + LEAQ 3(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeBlockAsm16K + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm16K + JB three_bytes_emit_remainder_encodeBlockAsm16K + +three_bytes_emit_remainder_encodeBlockAsm16K: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm16K + +two_bytes_emit_remainder_encodeBlockAsm16K: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeBlockAsm16K + JMP memmove_long_emit_remainder_encodeBlockAsm16K + +one_byte_emit_remainder_encodeBlockAsm16K: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_4through8: + MOVL 
(DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm16K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm16K + +memmove_midemit_remainder_encodeBlockAsm16K: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm16K + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm16K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeBlockAsm16K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm16K + +memmove_long_emit_remainder_encodeBlockAsm16K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeBlockAsm16K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm4K(dst []byte, src []byte, tmp *[2048]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBlockAsm4K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000010, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm4K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ 
zero_loop_encodeBlockAsm4K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm4K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm4K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x36, R11 + MOVWLZX (AX)(R10*2), SI + MOVWLZX (AX)(R11*2), R8 + MOVW DX, (AX)(R10*2) + MOVW DX, (AX)(R11*2) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm4K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBlockAsm4K + +repeat_extend_back_loop_encodeBlockAsm4K: + CMPL DI, SI + JBE repeat_extend_back_end_encodeBlockAsm4K + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm4K + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBlockAsm4K + +repeat_extend_back_end_encodeBlockAsm4K: + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 3(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeBlockAsm4K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeBlockAsm4K + JB three_bytes_repeat_emit_lits_encodeBlockAsm4K + +three_bytes_repeat_emit_lits_encodeBlockAsm4K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm4K + +two_bytes_repeat_emit_lits_encodeBlockAsm4K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeBlockAsm4K + JMP memmove_long_repeat_emit_lits_encodeBlockAsm4K + +one_byte_repeat_emit_lits_encodeBlockAsm4K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm4K + +memmove_midrepeat_emit_lits_encodeBlockAsm4K: + LEAQ (CX)(SI*1), R9 + + // 
genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm4K + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm4K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm4K + +memmove_long_repeat_emit_lits_encodeBlockAsm4K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeBlockAsm4K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm4K + +matchlen_loopback_16_repeat_extend_encodeBlockAsm4K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeBlockAsm4K: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm4K + JMP matchlen_match8_repeat_extend_encodeBlockAsm4K + +matchlen_bsf_16repeat_extend_encodeBlockAsm4K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm4K + +matchlen_match8_repeat_extend_encodeBlockAsm4K: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm4K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm4K + +matchlen_bsf_8_repeat_extend_encodeBlockAsm4K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm4K + +matchlen_match4_repeat_extend_encodeBlockAsm4K: + CMPL 
R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm4K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm4K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm4K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm4K + JB repeat_extend_forward_end_encodeBlockAsm4K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm4K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm4K + +matchlen_match1_repeat_extend_encodeBlockAsm4K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm4K + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm4K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBlockAsm4K + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBlockAsm4K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBlockAsm4K + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm4K + +repeat_three_match_repeat_encodeBlockAsm4K: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm4K + +repeat_two_match_repeat_encodeBlockAsm4K: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm4K + +repeat_one_match_repeat_encodeBlockAsm4K: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBlockAsm4K: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm4K + +no_repeat_found_encodeBlockAsm4K: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm4K + SHRQ $0x08, DI + MOVWLZX (AX)(R10*2), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm4K + MOVW R9, (AX)(R10*2) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm4K + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm4K + +candidate3_match_encodeBlockAsm4K: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm4K + +candidate2_match_encodeBlockAsm4K: + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm4K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm4K + +match_extend_back_loop_encodeBlockAsm4K: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm4K + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm4K + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm4K + JMP match_extend_back_loop_encodeBlockAsm4K + +match_extend_back_end_encodeBlockAsm4K: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm4K + +matchlen_loopback_16_match_nolit_encodeBlockAsm4K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBlockAsm4K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBlockAsm4K + JMP matchlen_match8_match_nolit_encodeBlockAsm4K + 
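The emitRepeat blocks that recur throughout these encoders pick one of four tiers from the compare constants 0x1d (29), 0x0000011e (30+256) and 0x0001001e (30+65536): a length up to 29 packs into the tag byte itself as length*8-4 (the LEAL -4()(reg*8) above), while longer lengths store length-30 behind the tags 0xec, 0xf4 and 0xfc in 1, 2 or 3 trailing bytes. (The 0xfc case writes a 4-byte MOVL but advances the output pointer by only four, so the value's top byte is overwritten by the next token — effectively a 3-byte field.) A Go sketch of the tiers, read directly off those constants (the function shape and the free-space precondition are assumptions, not the package's API):

package main

import (
	"encoding/binary"
	"fmt"
)

// emitRepeat writes a repeat token for length and returns the bytes used.
// Tier boundaries and tag values are taken from the compare constants in
// the generated assembly; dst needs at least 4 free bytes, length >= 1.
func emitRepeat(dst []byte, length int) int {
	switch {
	case length <= 29: // one byte: length*8 - 4
		dst[0] = byte(length*8 - 4)
		return 1
	case length < 30+256: // tag 0xec + 1 byte of length-30
		dst[0] = 0xec
		dst[1] = byte(length - 30)
		return 2
	case length < 30+65536: // tag 0xf4 + 2 bytes of length-30
		dst[0] = 0xf4
		binary.LittleEndian.PutUint16(dst[1:], uint16(length-30))
		return 3
	default: // tag 0xfc + 3 bytes of length-30
		dst[0] = 0xfc
		v := length - 30
		dst[1], dst[2], dst[3] = byte(v), byte(v>>8), byte(v>>16)
		return 4
	}
}

func main() {
	buf := make([]byte, 4)
	n := emitRepeat(buf, 300)
	fmt.Println(n, buf[:n]) // 3 [244 14 1] — 270 = 0x010e, little-endian
}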
+matchlen_bsf_16match_nolit_encodeBlockAsm4K:
+#ifdef GOAMD64_v3
+	TZCNTQ R12, R12
+
+#else
+	BSFQ R12, R12
+
+#endif
+	SARQ $0x03, R12
+	LEAL 8(R11)(R12*1), R11
+	JMP match_nolit_end_encodeBlockAsm4K
+
+matchlen_match8_match_nolit_encodeBlockAsm4K:
+	CMPL DI, $0x08
+	JB matchlen_match4_match_nolit_encodeBlockAsm4K
+	MOVQ (R9)(R11*1), R10
+	XORQ (SI)(R11*1), R10
+	JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4K
+	LEAL -8(DI), DI
+	LEAL 8(R11), R11
+	JMP matchlen_match4_match_nolit_encodeBlockAsm4K
+
+matchlen_bsf_8_match_nolit_encodeBlockAsm4K:
+#ifdef GOAMD64_v3
+	TZCNTQ R10, R10
+
+#else
+	BSFQ R10, R10
+
+#endif
+	SARQ $0x03, R10
+	LEAL (R11)(R10*1), R11
+	JMP match_nolit_end_encodeBlockAsm4K
+
+matchlen_match4_match_nolit_encodeBlockAsm4K:
+	CMPL DI, $0x04
+	JB matchlen_match2_match_nolit_encodeBlockAsm4K
+	MOVL (R9)(R11*1), R10
+	CMPL (SI)(R11*1), R10
+	JNE matchlen_match2_match_nolit_encodeBlockAsm4K
+	LEAL -4(DI), DI
+	LEAL 4(R11), R11
+
+matchlen_match2_match_nolit_encodeBlockAsm4K:
+	CMPL DI, $0x01
+	JE matchlen_match1_match_nolit_encodeBlockAsm4K
+	JB match_nolit_end_encodeBlockAsm4K
+	MOVW (R9)(R11*1), R10
+	CMPW (SI)(R11*1), R10
+	JNE matchlen_match1_match_nolit_encodeBlockAsm4K
+	LEAL 2(R11), R11
+	SUBL $0x02, DI
+	JZ match_nolit_end_encodeBlockAsm4K
+
+matchlen_match1_match_nolit_encodeBlockAsm4K:
+	MOVB (R9)(R11*1), R10
+	CMPB (SI)(R11*1), R10
+	JNE match_nolit_end_encodeBlockAsm4K
+	LEAL 1(R11), R11
+
+match_nolit_end_encodeBlockAsm4K:
+	ADDL R11, DX
+	ADDL $0x04, R11
+	MOVL 16(SP), SI
+	MOVL 12(SP), DI
+	MOVL DX, 12(SP)
+	SUBL DI, R8
+	JZ match_nolits_copy_encodeBlockAsm4K
+	LEAQ (BX)(DI*1), DI
+	CMPL R8, $0x03
+	JA match_emit_lits_copy_encodeBlockAsm4K
+	CMPL SI, $0x40
+	JB match_emit_lits_copy_encodeBlockAsm4K
+	MOVL (DI), DI
+
+	// emitCopy2WithLits
+	XORQ R9, R9
+	SUBL $0x40, SI
+	LEAL -11(R11), R10
+	LEAL -4(R11), R11
+	MOVW SI, 1(CX)
+	CMPL R11, $0x07
+	CMOVLGE R10, R9
+	MOVQ $0x00000007, SI
+	CMOVLLT R11, SI
+	LEAL -1(R8)(SI*4), SI
+	MOVL $0x00000003, R10
+	LEAL (R10)(SI*8), SI
+	MOVB SI, (CX)
+	ADDQ $0x03, CX
+	MOVL DI, (CX)
+	ADDQ R8, CX
+	TESTL R9, R9
+	JZ match_nolit_emitcopy_end_encodeBlockAsm4K
+
+	// emitRepeat
+	LEAL -1(R9), SI
+	CMPL R9, $0x1d
+	JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm4K
+	LEAL -30(R9), SI
+	CMPL R9, $0x0000011e
+	JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm4K
+	CMPL R9, $0x0001001e
+	JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm4K
+	MOVB $0xfc, (CX)
+	MOVL SI, 1(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm4K
+
+repeat_three_match_emit_repeat_copy2_encodeBlockAsm4K:
+	MOVB $0xf4, (CX)
+	MOVW SI, 1(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm4K
+
+repeat_two_match_emit_repeat_copy2_encodeBlockAsm4K:
+	MOVB $0xec, (CX)
+	MOVB SI, 1(CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm4K
+
+repeat_one_match_emit_repeat_copy2_encodeBlockAsm4K:
+	XORL SI, SI
+	LEAL -4(SI)(R9*8), SI
+	MOVB SI, (CX)
+	ADDQ $0x01, CX
+	JMP match_nolit_emitcopy_end_encodeBlockAsm4K
+
+match_emit_lits_copy_encodeBlockAsm4K:
+	LEAQ 3(CX)(R8*1), R9
+	CMPQ R9, (SP)
+	JB dst_size_check_ok_3
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+dst_size_check_ok_3:
+	// emitLiteral
+	LEAL -1(R8), R9
+	CMPL R9, $0x1d
+	JB one_byte_match_emit_encodeBlockAsm4K
+	SUBL $0x1d, R9
+	CMPL R9, $0x00000100
+	JB two_bytes_match_emit_encodeBlockAsm4K
+	JB three_bytes_match_emit_encodeBlockAsm4K
+
+three_bytes_match_emit_encodeBlockAsm4K:
+	MOVB $0xf0, (CX)
+	MOVW R9, 1(CX)
+	ADDQ $0x03, CX
+	ADDL $0x1d, R9
+	JMP memmove_long_match_emit_encodeBlockAsm4K
+
+two_bytes_match_emit_encodeBlockAsm4K:
+	MOVB $0xe8, (CX)
+	MOVB R9, 1(CX)
+	ADDL $0x1d, R9
+	ADDQ $0x02, CX
+	CMPL R9, $0x40
+	JB memmove_midmatch_emit_encodeBlockAsm4K
+	JMP memmove_long_match_emit_encodeBlockAsm4K
+
+one_byte_match_emit_encodeBlockAsm4K:
+	SHLB $0x03, R9
+	MOVB R9, (CX)
+	ADDQ $0x01, CX
+	LEAQ (CX)(R8*1), R9
+
+	// genMemMoveShort
+	// margin: 16, min move: 1
+	CMPQ R8, $0x10
+	JBE emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_8through16
+	CMPQ R8, $0x20
+	JBE emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_17through32
+	JMP emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_33through64
+
+emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_8through16:
+	MOVOU (DI), X0
+	MOVOU X0, (CX)
+	JMP memmove_end_copy_match_emit_encodeBlockAsm4K
+
+emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R8*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(R8*1)
+	JMP memmove_end_copy_match_emit_encodeBlockAsm4K
+
+emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R8*1), X2
+	MOVOU -16(DI)(R8*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R8*1)
+	MOVOU X3, -16(CX)(R8*1)
+
+memmove_end_copy_match_emit_encodeBlockAsm4K:
+	MOVQ R9, CX
+	JMP match_nolits_copy_encodeBlockAsm4K
+
+memmove_midmatch_emit_encodeBlockAsm4K:
+	LEAQ (CX)(R8*1), R9
+
+	// genMemMoveShort
+	// margin: 15, min move: 30
+	CMPQ R8, $0x20
+	JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_17through32
+	JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_33through64
+
+emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R8*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(R8*1)
+	JMP memmove_mid_end_copy_match_emit_encodeBlockAsm4K
+
+emit_lit_memmove_mid_match_emit_encodeBlockAsm4K_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R8*1), X2
+	MOVOU -16(DI)(R8*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R8*1)
+	MOVOU X3, -16(CX)(R8*1)
+
+memmove_mid_end_copy_match_emit_encodeBlockAsm4K:
+	MOVQ R9, CX
+	JMP match_nolits_copy_encodeBlockAsm4K
+
+memmove_long_match_emit_encodeBlockAsm4K:
+	LEAQ (CX)(R8*1), R9
+
+	// genMemMoveLong
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R8*1), X2
+	MOVOU -16(DI)(R8*1), X3
+	MOVQ R8, R12
+	SHRQ $0x05, R12
+	MOVQ CX, R10
+	ANDL $0x0000001f, R10
+	MOVQ $0x00000040, R13
+	SUBQ R10, R13
+	DECQ R12
+	JA emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32
+	LEAQ -32(DI)(R13*1), R10
+	LEAQ -32(CX)(R13*1), R14
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R14)
+	MOVOA X5, 16(R14)
+	ADDQ $0x20, R14
+	ADDQ $0x20, R10
+	ADDQ $0x20, R13
+	DECQ R12
+	JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R13*1), X4
+	MOVOU -16(DI)(R13*1), X5
+	MOVOA X4, -32(CX)(R13*1)
+	MOVOA X5, -16(CX)(R13*1)
+	ADDQ $0x20, R13
+	CMPQ R8, R13
+	JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R8*1)
+	MOVOU X3, -16(CX)(R8*1)
+	MOVQ R9, CX
+
+match_nolits_copy_encodeBlockAsm4K:
+	// emitCopy
+	CMPL SI, $0x00000400
+	JA two_byte_match_nolit_encodeBlockAsm4K
+	CMPL R11, $0x00000013
+ JAE emit_one_longer_match_nolit_encodeBlockAsm4K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4K + +emit_one_longer_match_nolit_encodeBlockAsm4K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBlockAsm4K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4K + +emit_copy1_repeat_match_nolit_encodeBlockAsm4K: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm4K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4K + +two_byte_match_nolit_encodeBlockAsm4K: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBlockAsm4K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBlockAsm4K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBlockAsm4K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4K + +emit_copy2_2_match_nolit_encodeBlockAsm4K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4K + +emit_copy2_1_match_nolit_encodeBlockAsm4K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm4K + +emit_copy2_0_match_nolit_encodeBlockAsm4K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm4K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm4K + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm4K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm4K: + MOVQ $0x9e3779b1, SI + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, R9 + SHLQ $0x20, R8 + IMULQ SI, R8 + SHRQ $0x36, R8 + SHLQ $0x20, R9 + IMULQ SI, R9 + SHRQ $0x36, R9 + LEAL -2(DX), R10 + MOVWLZX (AX)(R9*2), SI + MOVW R10, (AX)(R8*2) + MOVW DX, (AX)(R9*2) + MOVL DX, R8 + INCL DX + CMPL (BX)(SI*1), DI + JNE search_loop_encodeBlockAsm4K + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x03, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm4K + +matchlen_loopback_16_match_nolit2_encodeBlockAsm4K: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ 
(SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm4K + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm4K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeBlockAsm4K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm4K + JMP matchlen_match8_match_nolit2_encodeBlockAsm4K + +matchlen_bsf_16match_nolit2_encodeBlockAsm4K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeBlockAsm4K + +matchlen_match8_match_nolit2_encodeBlockAsm4K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeBlockAsm4K + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm4K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeBlockAsm4K + +matchlen_bsf_8_match_nolit2_encodeBlockAsm4K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R11)(R9*1), R11 + JMP match_nolit2_end_encodeBlockAsm4K + +matchlen_match4_match_nolit2_encodeBlockAsm4K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeBlockAsm4K + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeBlockAsm4K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeBlockAsm4K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeBlockAsm4K + JB match_nolit2_end_encodeBlockAsm4K + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeBlockAsm4K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeBlockAsm4K + +matchlen_match1_match_nolit2_encodeBlockAsm4K: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeBlockAsm4K + LEAL 1(R11), R11 + +match_nolit2_end_encodeBlockAsm4K: + ADDL R11, DX + ADDL $0x04, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeBlockAsm4K + +emit_remainder_encodeBlockAsm4K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeBlockAsm4K + LEAQ (BX)(DX*1), DX + LEAQ 3(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeBlockAsm4K + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm4K + JB three_bytes_emit_remainder_encodeBlockAsm4K + +three_bytes_emit_remainder_encodeBlockAsm4K: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm4K + +two_bytes_emit_remainder_encodeBlockAsm4K: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeBlockAsm4K + JMP memmove_long_emit_remainder_encodeBlockAsm4K + +one_byte_emit_remainder_encodeBlockAsm4K: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_17through32 + JMP 
emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm4K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm4K + +memmove_midemit_remainder_encodeBlockAsm4K: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm4K + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm4K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeBlockAsm4K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm4K + +memmove_long_emit_remainder_encodeBlockAsm4K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeBlockAsm4K: + MOVQ 
dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBlockAsm1K(dst []byte, src []byte, tmp *[1024]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBlockAsm1K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000008, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeBlockAsm1K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeBlockAsm1K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x05, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeBlockAsm1K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm1K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0x9e3779b1, R9 + MOVQ DI, R10 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x37, R10 + SHLQ $0x20, R11 + IMULQ R9, R11 + SHRQ $0x37, R11 + MOVWLZX (AX)(R10*2), SI + MOVWLZX (AX)(R11*2), R8 + MOVW DX, (AX)(R10*2) + MOVW DX, (AX)(R11*2) + MOVQ DI, R10 + SHRQ $0x10, R10 + SHLQ $0x20, R10 + IMULQ R9, R10 + SHRQ $0x37, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeBlockAsm1K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBlockAsm1K + +repeat_extend_back_loop_encodeBlockAsm1K: + CMPL DI, SI + JBE repeat_extend_back_end_encodeBlockAsm1K + MOVB -1(BX)(R8*1), R9 + MOVB -1(BX)(DI*1), R10 + CMPB R9, R10 + JNE repeat_extend_back_end_encodeBlockAsm1K + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBlockAsm1K + +repeat_extend_back_end_encodeBlockAsm1K: + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 3(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeBlockAsm1K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeBlockAsm1K + JB three_bytes_repeat_emit_lits_encodeBlockAsm1K + +three_bytes_repeat_emit_lits_encodeBlockAsm1K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeBlockAsm1K + +two_bytes_repeat_emit_lits_encodeBlockAsm1K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeBlockAsm1K + JMP memmove_long_repeat_emit_lits_encodeBlockAsm1K + +one_byte_repeat_emit_lits_encodeBlockAsm1K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, 
-16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K + +emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm1K + +memmove_midrepeat_emit_lits_encodeBlockAsm1K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm1K + +emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm1K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeBlockAsm1K + +memmove_long_repeat_emit_lits_encodeBlockAsm1K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeBlockAsm1K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm1K + +matchlen_loopback_16_repeat_extend_encodeBlockAsm1K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm1K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm1K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeBlockAsm1K: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm1K + JMP matchlen_match8_repeat_extend_encodeBlockAsm1K + +matchlen_bsf_16repeat_extend_encodeBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm1K + +matchlen_match8_repeat_extend_encodeBlockAsm1K: + CMPL R8, 
$0x08 + JB matchlen_match4_repeat_extend_encodeBlockAsm1K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm1K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm1K + +matchlen_bsf_8_repeat_extend_encodeBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm1K + +matchlen_match4_repeat_extend_encodeBlockAsm1K: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeBlockAsm1K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBlockAsm1K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBlockAsm1K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeBlockAsm1K + JB repeat_extend_forward_end_encodeBlockAsm1K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBlockAsm1K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeBlockAsm1K + +matchlen_match1_repeat_extend_encodeBlockAsm1K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm1K + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBlockAsm1K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBlockAsm1K + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBlockAsm1K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBlockAsm1K + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBlockAsm1K + +repeat_three_match_repeat_encodeBlockAsm1K: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBlockAsm1K + +repeat_two_match_repeat_encodeBlockAsm1K: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBlockAsm1K + +repeat_one_match_repeat_encodeBlockAsm1K: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBlockAsm1K: + MOVL DX, 12(SP) + JMP search_loop_encodeBlockAsm1K + +no_repeat_found_encodeBlockAsm1K: + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm1K + SHRQ $0x08, DI + MOVWLZX (AX)(R10*2), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm1K + MOVW R9, (AX)(R10*2) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm1K + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm1K + +candidate3_match_encodeBlockAsm1K: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm1K + +candidate2_match_encodeBlockAsm1K: + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm1K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm1K + +match_extend_back_loop_encodeBlockAsm1K: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm1K + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm1K + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm1K + JMP match_extend_back_loop_encodeBlockAsm1K + +match_extend_back_end_encodeBlockAsm1K: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x04, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP 
matchlen_loop_16_entry_match_nolit_encodeBlockAsm1K + +matchlen_loopback_16_match_nolit_encodeBlockAsm1K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm1K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm1K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBlockAsm1K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBlockAsm1K + JMP matchlen_match8_match_nolit_encodeBlockAsm1K + +matchlen_bsf_16match_nolit_encodeBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBlockAsm1K + +matchlen_match8_match_nolit_encodeBlockAsm1K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBlockAsm1K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm1K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBlockAsm1K + +matchlen_bsf_8_match_nolit_encodeBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBlockAsm1K + +matchlen_match4_match_nolit_encodeBlockAsm1K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBlockAsm1K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBlockAsm1K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBlockAsm1K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBlockAsm1K + JB match_nolit_end_encodeBlockAsm1K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBlockAsm1K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBlockAsm1K + +matchlen_match1_match_nolit_encodeBlockAsm1K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeBlockAsm1K + LEAL 1(R11), R11 + +match_nolit_end_encodeBlockAsm1K: + ADDL R11, DX + ADDL $0x04, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeBlockAsm1K + LEAQ (BX)(DI*1), DI + CMPL R8, $0x03 + JA match_emit_lits_copy_encodeBlockAsm1K + CMPL SI, $0x40 + JB match_emit_lits_copy_encodeBlockAsm1K + MOVL (DI), DI + + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, SI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, SI + CMOVLLT R11, SI + LEAL -1(R8)(SI*4), SI + MOVL $0x00000003, R10 + LEAL (R10)(SI*8), SI + MOVB SI, (CX) + ADDQ $0x03, CX + MOVL DI, (CX) + ADDQ R8, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm1K + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBlockAsm1K + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBlockAsm1K + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBlockAsm1K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +repeat_three_match_emit_repeat_copy2_encodeBlockAsm1K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +repeat_two_match_emit_repeat_copy2_encodeBlockAsm1K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +repeat_one_match_emit_repeat_copy2_encodeBlockAsm1K: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP 
match_nolit_emitcopy_end_encodeBlockAsm1K + +match_emit_lits_copy_encodeBlockAsm1K: + LEAQ 3(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBlockAsm1K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBlockAsm1K + JB three_bytes_match_emit_encodeBlockAsm1K + +three_bytes_match_emit_encodeBlockAsm1K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBlockAsm1K + +two_bytes_match_emit_encodeBlockAsm1K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBlockAsm1K + JMP memmove_long_match_emit_encodeBlockAsm1K + +one_byte_match_emit_encodeBlockAsm1K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeBlockAsm1K + +emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm1K + +emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm1K: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm1K + +memmove_midmatch_emit_encodeBlockAsm1K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeBlockAsm1K + +emit_lit_memmove_mid_match_emit_encodeBlockAsm1K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeBlockAsm1K: + MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm1K + +memmove_long_match_emit_encodeBlockAsm1K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA 
emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + +match_nolits_copy_encodeBlockAsm1K: + // emitCopy + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeBlockAsm1K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBlockAsm1K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +emit_one_longer_match_nolit_encodeBlockAsm1K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBlockAsm1K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +emit_copy1_repeat_match_nolit_encodeBlockAsm1K: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBlockAsm1K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +two_byte_match_nolit_encodeBlockAsm1K: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBlockAsm1K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBlockAsm1K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBlockAsm1K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +emit_copy2_2_match_nolit_encodeBlockAsm1K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +emit_copy2_1_match_nolit_encodeBlockAsm1K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBlockAsm1K + +emit_copy2_0_match_nolit_encodeBlockAsm1K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeBlockAsm1K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeBlockAsm1K + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBlockAsm1K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm1K: + MOVQ $0x9e3779b1, SI + MOVQ DI, R8 + SHRQ $0x10, DI + MOVQ DI, R9 + SHLQ $0x20, R8 + IMULQ SI, R8 + SHRQ $0x37, R8 + SHLQ $0x20, R9 + IMULQ SI, R9 + SHRQ $0x37, R9 + LEAL -2(DX), R10 + MOVWLZX 
(AX)(R9*2), SI + MOVW R10, (AX)(R8*2) + MOVW DX, (AX)(R9*2) + MOVL DX, R8 + INCL DX + CMPL (BX)(SI*1), DI + JNE search_loop_encodeBlockAsm1K + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x03, DX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm1K + +matchlen_loopback_16_match_nolit2_encodeBlockAsm1K: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm1K + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeBlockAsm1K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeBlockAsm1K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm1K + JMP matchlen_match8_match_nolit2_encodeBlockAsm1K + +matchlen_bsf_16match_nolit2_encodeBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeBlockAsm1K + +matchlen_match8_match_nolit2_encodeBlockAsm1K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeBlockAsm1K + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeBlockAsm1K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeBlockAsm1K + +matchlen_bsf_8_match_nolit2_encodeBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R11)(R9*1), R11 + JMP match_nolit2_end_encodeBlockAsm1K + +matchlen_match4_match_nolit2_encodeBlockAsm1K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeBlockAsm1K + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeBlockAsm1K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeBlockAsm1K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeBlockAsm1K + JB match_nolit2_end_encodeBlockAsm1K + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeBlockAsm1K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeBlockAsm1K + +matchlen_match1_match_nolit2_encodeBlockAsm1K: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeBlockAsm1K + LEAL 1(R11), R11 + +match_nolit2_end_encodeBlockAsm1K: + ADDL R11, DX + ADDL $0x04, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeBlockAsm1K + +emit_remainder_encodeBlockAsm1K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeBlockAsm1K + LEAQ (BX)(DX*1), DX + LEAQ 3(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeBlockAsm1K + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeBlockAsm1K + JB three_bytes_emit_remainder_encodeBlockAsm1K + +three_bytes_emit_remainder_encodeBlockAsm1K: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeBlockAsm1K + +two_bytes_emit_remainder_encodeBlockAsm1K: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeBlockAsm1K + JMP memmove_long_emit_remainder_encodeBlockAsm1K + +one_byte_emit_remainder_encodeBlockAsm1K: + SHLB $0x03, BL + MOVB 
BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm1K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm1K + +memmove_midemit_remainder_encodeBlockAsm1K: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBlockAsm1K + +emit_lit_memmove_mid_emit_remainder_encodeBlockAsm1K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeBlockAsm1K: + MOVQ BX, CX + JMP emit_remainder_end_encodeBlockAsm1K + +memmove_long_emit_remainder_encodeBlockAsm1K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ 
$0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeBlockAsm1K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBetterBlockAsm(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00001200, DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm: + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -17(AX), DX + LEAQ -17(AX), DI + MOVL DI, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm: + MOVQ tmp+48(FP), DI + MOVL AX, R8 + SUBL 12(SP), R8 + SHRL $0x08, R8 + CMPL R8, $0x63 + JBE check_maxskip_ok_encodeBetterBlockAsm + LEAL 100(AX), R8 + JMP check_maxskip_cont_encodeBetterBlockAsm + +check_maxskip_ok_encodeBetterBlockAsm: + LEAL 1(AX)(R8*1), R8 + +check_maxskip_cont_encodeBetterBlockAsm: + CMPL R8, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm + MOVQ (DX)(AX*1), R9 + MOVL R8, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R11 + MOVQ $0x9e3779b1, R8 + MOVQ R9, R12 + MOVQ R9, R13 + SHLQ $0x08, R12 + IMULQ R11, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + MOVL (DI)(R12*4), R8 + MOVL 524288(DI)(R13*4), R10 + MOVL AX, (DI)(R12*4) + MOVL AX, 524288(DI)(R13*4) + LEAL -2162685(AX), R12 + CMPL R8, R12 + JLE offset_ok_0_encodeBetterBlockAsm + MOVQ (DX)(R8*1), BX + CMPQ BX, R9 + JEQ candidate_match_encodeBetterBlockAsm + +offset_ok_0_encodeBetterBlockAsm: + CMPL R10, R12 + JLE offset_ok_1_encodeBetterBlockAsm + MOVQ (DX)(R10*1), SI + CMPQ SI, R9 + +offset_ok_1_encodeBetterBlockAsm: + MOVL AX, R13 + SUBL 16(SP), R13 + MOVQ (DX)(R13*1), R13 + MOVQ $0x000000ffffffff00, R14 + XORQ R9, R13 + TESTQ R14, R13 + JNE no_repeat_found_encodeBetterBlockAsm + LEAL 1(AX), DI + MOVL 12(SP), R8 + MOVL DI, R9 + SUBL 16(SP), R9 + JZ repeat_extend_back_end_encodeBetterBlockAsm + +repeat_extend_back_loop_encodeBetterBlockAsm: + CMPL DI, R8 + JBE repeat_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(R9*1), R10 + MOVB -1(DX)(DI*1), R11 + CMPB R10, R11 + JNE repeat_extend_back_end_encodeBetterBlockAsm + LEAL -1(DI), DI + DECL R9 + JNZ repeat_extend_back_loop_encodeBetterBlockAsm + +repeat_extend_back_end_encodeBetterBlockAsm: + MOVL DI, R8 + SUBL 12(SP), R8 + LEAQ 4(CX)(R8*1), R8 + CMPQ R8, (SP) + JB repeat_dst_size_check_encodeBetterBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBetterBlockAsm: + // emitLiteralsDstP + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), R10 + SUBL R8, R9 + + // emitLiteral + LEAL -1(R9), R8 + CMPL R8, $0x1d + JB one_byte_repeat_emit_encodeBetterBlockAsm + SUBL $0x1d, R8 + CMPL R8, 
$0x00000100 + JB two_bytes_repeat_emit_encodeBetterBlockAsm + CMPL R8, $0x00010000 + JB three_bytes_repeat_emit_encodeBetterBlockAsm + MOVL R8, R11 + SHRL $0x10, R11 + MOVB $0xf8, (CX) + MOVW R8, 1(CX) + MOVB R11, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R8 + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +three_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xf0, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R8 + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +two_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xe8, (CX) + MOVB R8, 1(CX) + ADDL $0x1d, R8 + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_midrepeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +one_byte_repeat_emit_encodeBetterBlockAsm: + SHLB $0x03, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R9, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVOU (R10), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm: + MOVQ R8, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm + +memmove_midrepeat_emit_encodeBetterBlockAsm: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R9, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm: + MOVQ R8, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm + +memmove_long_repeat_emit_encodeBetterBlockAsm: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R13*1), R11 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA 
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R13*1), X4 + MOVOU -16(R10)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R9, R13 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm: + ADDL $0x05, AX + MOVL AX, R8 + SUBL 16(SP), R8 + MOVQ src_len+32(FP), R9 + SUBL AX, R9 + LEAQ (DX)(AX*1), R10 + LEAQ (DX)(R8*1), R8 + + // matchLen + XORL R12, R12 + JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm + +matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm: + MOVQ (R10)(R12*1), R11 + MOVQ 8(R10)(R12*1), R13 + XORQ (R8)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm + XORQ 8(R8)(R12*1), R13 + JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm + LEAL -16(R9), R9 + LEAL 16(R12), R12 + +matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm: + CMPL R9, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm + JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm + +matchlen_bsf_16repeat_extend_encodeBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R13, R13 + +#else + BSFQ R13, R13 + +#endif + SARQ $0x03, R13 + LEAL 8(R12)(R13*1), R12 + JMP repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_match8_repeat_extend_encodeBetterBlockAsm: + CMPL R9, $0x08 + JB matchlen_match4_repeat_extend_encodeBetterBlockAsm + MOVQ (R10)(R12*1), R11 + XORQ (R8)(R12*1), R11 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm + LEAL -8(R9), R9 + LEAL 8(R12), R12 + JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm + +matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_match4_repeat_extend_encodeBetterBlockAsm: + CMPL R9, $0x04 + JB matchlen_match2_repeat_extend_encodeBetterBlockAsm + MOVL (R10)(R12*1), R11 + CMPL (R8)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm + LEAL -4(R9), R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm: + CMPL R9, $0x01 + JE matchlen_match1_repeat_extend_encodeBetterBlockAsm + JB repeat_extend_forward_end_encodeBetterBlockAsm + MOVW (R10)(R12*1), R11 + CMPW (R8)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm + LEAL 2(R12), R12 + SUBL $0x02, R9 + JZ repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_match1_repeat_extend_encodeBetterBlockAsm: + MOVB (R10)(R12*1), R11 + CMPB (R8)(R12*1), R11 + JNE repeat_extend_forward_end_encodeBetterBlockAsm + LEAL 1(R12), R12 + +repeat_extend_forward_end_encodeBetterBlockAsm: + ADDL R12, AX + MOVL AX, R8 + SUBL DI, R8 + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(R8), DI + CMPL R8, $0x1d + JBE repeat_one_match_repeat_encodeBetterBlockAsm + LEAL -30(R8), DI + CMPL R8, $0x0000011e + JB repeat_two_match_repeat_encodeBetterBlockAsm + CMPL R8, $0x0001001e + JB repeat_three_match_repeat_encodeBetterBlockAsm + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_match_repeat_encodeBetterBlockAsm: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_match_repeat_encodeBetterBlockAsm: + 
MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_one_match_repeat_encodeBetterBlockAsm: + XORL DI, DI + LEAL -4(DI)(R8*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBetterBlockAsm: + MOVL AX, 12(SP) + JMP search_loop_encodeBetterBlockAsm + +no_repeat_found_encodeBetterBlockAsm: + CMPL R8, R12 + JLE offset_ok_2_encodeBetterBlockAsm + CMPL BX, R9 + JEQ candidate_match_encodeBetterBlockAsm + +offset_ok_2_encodeBetterBlockAsm: + CMPL R10, R12 + JLE offset_ok_3_encodeBetterBlockAsm + CMPL SI, R9 + JEQ candidateS_match_encodeBetterBlockAsm + +offset_ok_3_encodeBetterBlockAsm: + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm + +candidateS_match_encodeBetterBlockAsm: + SHRQ $0x08, R9 + MOVQ R9, R13 + SHLQ $0x08, R13 + IMULQ R11, R13 + SHRQ $0x2f, R13 + MOVL (DI)(R13*4), R8 + INCL AX + MOVL AX, (DI)(R13*4) + CMPL R8, R12 + JLE offset_ok_4_encodeBetterBlockAsm + CMPL (DX)(R8*1), R9 + JEQ candidate_match_encodeBetterBlockAsm + +offset_ok_4_encodeBetterBlockAsm: + DECL AX + MOVL R10, R8 + +candidate_match_encodeBetterBlockAsm: + MOVL 12(SP), DI + TESTL R8, R8 + JZ match_extend_back_end_encodeBetterBlockAsm + +match_extend_back_loop_encodeBetterBlockAsm: + CMPL AX, DI + JBE match_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(R8*1), R9 + MOVB -1(DX)(AX*1), R10 + CMPB R9, R10 + JNE match_extend_back_end_encodeBetterBlockAsm + LEAL -1(AX), AX + DECL R8 + JZ match_extend_back_end_encodeBetterBlockAsm + JMP match_extend_back_loop_encodeBetterBlockAsm + +match_extend_back_end_encodeBetterBlockAsm: + MOVL AX, DI + SUBL 12(SP), DI + LEAQ 4(CX)(DI*1), DI + CMPQ DI, (SP) + JB match_dst_size_check_encodeBetterBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm: + MOVL AX, DI + ADDL $0x04, AX + ADDL $0x04, R8 + MOVQ src_len+32(FP), R9 + SUBL AX, R9 + LEAQ (DX)(AX*1), R10 + LEAQ (DX)(R8*1), R11 + + // matchLen + XORL R13, R13 + JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: + MOVQ (R10)(R13*1), R12 + MOVQ 8(R10)(R13*1), R14 + XORQ (R11)(R13*1), R12 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm + XORQ 8(R11)(R13*1), R14 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm + LEAL -16(R9), R9 + LEAL 16(R13), R13 + +matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm: + CMPL R9, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm + JMP matchlen_match8_match_nolit_encodeBetterBlockAsm + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R14, R14 + +#else + BSFQ R14, R14 + +#endif + SARQ $0x03, R14 + LEAL 8(R13)(R14*1), R13 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_match8_match_nolit_encodeBetterBlockAsm: + CMPL R9, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm + MOVQ (R10)(R13*1), R12 + XORQ (R11)(R13*1), R12 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm + LEAL -8(R9), R9 + LEAL 8(R13), R13 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL (R13)(R12*1), R13 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_match4_match_nolit_encodeBetterBlockAsm: + CMPL R9, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm + MOVL (R10)(R13*1), R12 + CMPL (R11)(R13*1), R12 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm + LEAL -4(R9), R9 + LEAL 4(R13), R13 + 
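+	// matchLen tail: the 16- and 8-byte probes above XOR whole words and,
+	// on a mismatch, use TZCNT (BSF on pre-v3 targets) plus SARQ $0x03 to
+	// turn the first differing bit into a byte count added to R13; the
+	// 2- and 1-byte compares below finish whatever remains.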
+matchlen_match2_match_nolit_encodeBetterBlockAsm: + CMPL R9, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm + JB match_nolit_end_encodeBetterBlockAsm + MOVW (R10)(R13*1), R12 + CMPW (R11)(R13*1), R12 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm + LEAL 2(R13), R13 + SUBL $0x02, R9 + JZ match_nolit_end_encodeBetterBlockAsm + +matchlen_match1_match_nolit_encodeBetterBlockAsm: + MOVB (R10)(R13*1), R12 + CMPB (R11)(R13*1), R12 + JNE match_nolit_end_encodeBetterBlockAsm + LEAL 1(R13), R13 + +match_nolit_end_encodeBetterBlockAsm: + MOVL AX, R9 + SUBL R8, R9 + CMPL R13, $0x01 + JA match_length_ok_encodeBetterBlockAsm + CMPL R9, $0x0001003f + JBE match_length_ok_encodeBetterBlockAsm + MOVL 20(SP), AX + INCL AX + JMP search_loop_encodeBetterBlockAsm + +match_length_ok_encodeBetterBlockAsm: + MOVL R9, 16(SP) + + // Check if we can combine lit+copy + MOVLQZX 12(SP), R10 + MOVL DI, R8 + SUBL R10, R8 + JZ match_emit_nolits_encodeBetterBlockAsm + CMPL R9, $0x00000040 + JL match_emit_lits_encodeBetterBlockAsm + CMPL R9, $0x0001003f + JA match_emit_copy3_encodeBetterBlockAsm + CMPL R8, $0x04 + JA match_emit_lits_encodeBetterBlockAsm + MOVL (DX)(R10*1), R10 + ADDL R13, AX + ADDL $0x04, R13 + MOVL AX, 12(SP) + + // emitCopy2WithLits + XORQ R11, R11 + SUBL $0x40, R9 + LEAL -11(R13), R12 + LEAL -4(R13), R13 + MOVW R9, 1(CX) + CMPL R13, $0x07 + CMOVLGE R12, R11 + MOVQ $0x00000007, R9 + CMOVLLT R13, R9 + LEAL -1(R8)(R9*4), R9 + MOVL $0x00000003, R12 + LEAL (R12)(R9*8), R9 + MOVB R9, (CX) + ADDQ $0x03, CX + MOVL R10, (CX) + ADDQ R8, CX + TESTL R11, R11 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm + + // emitRepeat + LEAL -1(R11), R8 + CMPL R11, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm + LEAL -30(R11), R8 + CMPL R11, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm + CMPL R11, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm + MOVB $0xfc, (CX) + MOVL R8, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm: + MOVB $0xec, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm: + XORL R8, R8 + LEAL -4(R8)(R11*8), R8 + MOVB R8, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +match_emit_copy3_encodeBetterBlockAsm: + CMPL R8, $0x03 + JA match_emit_lits_encodeBetterBlockAsm + MOVLQZX 12(SP), R10 + MOVL (DX)(R10*1), R10 + ADDL R13, AX + ADDL $0x04, R13 + MOVL AX, 12(SP) + + // emitCopy3 + LEAL -4(R13), R13 + LEAL -65536(R9), R9 + SHLL $0x0b, R9 + LEAL 7(R9)(R8*8), R9 + CMPL R13, $0x3c + JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm + LEAL -60(R13), R11 + CMPL R13, $0x0000013c + JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm + CMPL R13, $0x0001003c + JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm + ADDL $0x000007e0, R9 + MOVL R9, (CX) + MOVL R11, 4(CX) + ADDQ $0x07, CX + JMP match_emit_copy_litsencodeBetterBlockAsm + +emit_copy3_2_match_emit_lits_encodeBetterBlockAsm: + ADDL $0x000007c0, R9 + MOVL R9, (CX) + MOVW R11, 4(CX) + ADDQ $0x06, CX + JMP match_emit_copy_litsencodeBetterBlockAsm + +emit_copy3_1_match_emit_lits_encodeBetterBlockAsm: + ADDL $0x000007a0, R9 + MOVL R9, (CX) + MOVB R11, 4(CX) + ADDQ $0x05, CX + JMP 
match_emit_copy_litsencodeBetterBlockAsm + +emit_copy3_0_match_emit_lits_encodeBetterBlockAsm: + SHLL $0x05, R13 + ORL R13, R9 + MOVL R9, (CX) + ADDQ $0x04, CX + +match_emit_copy_litsencodeBetterBlockAsm: + MOVL R10, (CX) + ADDQ R8, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +match_emit_lits_encodeBetterBlockAsm: + LEAQ (DX)(R10*1), R10 + + // emitLiteral + LEAL -1(R8), R11 + CMPL R11, $0x1d + JB one_byte_match_emit_encodeBetterBlockAsm + SUBL $0x1d, R11 + CMPL R11, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm + CMPL R11, $0x00010000 + JB three_bytes_match_emit_encodeBetterBlockAsm + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (CX) + MOVW R11, 1(CX) + MOVB R12, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R11 + JMP memmove_long_match_emit_encodeBetterBlockAsm + +three_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf0, (CX) + MOVW R11, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R11 + JMP memmove_long_match_emit_encodeBetterBlockAsm + +two_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xe8, (CX) + MOVB R11, 1(CX) + ADDL $0x1d, R11 + ADDQ $0x02, CX + CMPL R11, $0x40 + JB memmove_midmatch_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm + +one_byte_match_emit_encodeBetterBlockAsm: + SHLB $0x03, R11 + MOVB R11, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R11 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVOU (R10), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R8*1), X2 + MOVOU -16(R10)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm: + MOVQ R11, CX + JMP match_emit_nolits_encodeBetterBlockAsm + +memmove_midmatch_emit_encodeBetterBlockAsm: + LEAQ (CX)(R8*1), R11 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R8*1), X2 + MOVOU -16(R10)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeBetterBlockAsm: + MOVQ R11, CX + JMP match_emit_nolits_encodeBetterBlockAsm + +memmove_long_match_emit_encodeBetterBlockAsm: + LEAQ (CX)(R8*1), R11 + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R8*1), X2 + MOVOU -16(R10)(R8*1), X3 + MOVQ R8, R14 + SHRQ $0x05, R14 + MOVQ CX, R12 + 
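+	// The ANDL/SUBQ below compute R15 = 64 - (CX & 31): the first copy
+	// offset at which the 32-byte MOVOA stores in the loop are aligned.
+	// The unaligned head and tail were already captured in X0-X3 and are
+	// stored after the loop.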
ANDL $0x0000001f, R12 + MOVQ $0x00000040, R15 + SUBQ R12, R15 + DECQ R14 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R15*1), R12 + LEAQ -32(CX)(R15*1), BP + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R12), X4 + MOVOU 16(R12), X5 + MOVOA X4, (BP) + MOVOA X5, 16(BP) + ADDQ $0x20, BP + ADDQ $0x20, R12 + ADDQ $0x20, R15 + DECQ R14 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R15*1), X4 + MOVOU -16(R10)(R15*1), X5 + MOVOA X4, -32(CX)(R15*1) + MOVOA X5, -16(CX)(R15*1) + ADDQ $0x20, R15 + CMPQ R8, R15 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R11, CX + +match_emit_nolits_encodeBetterBlockAsm: + ADDL R13, AX + ADDL $0x04, R13 + MOVL AX, 12(SP) + + // emitCopy + CMPL R9, $0x0001003f + JBE two_byte_offset_match_nolit_encodeBetterBlockAsm + + // emitCopy3 + LEAL -4(R13), R13 + LEAL -65536(R9), R8 + SHLL $0x0b, R8 + ADDL $0x07, R8 + CMPL R13, $0x3c + JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3 + LEAL -60(R13), R9 + CMPL R13, $0x0000013c + JB emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3 + CMPL R13, $0x0001003c + JB emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3 + ADDL $0x000007e0, R8 + MOVL R8, (CX) + MOVL R9, 4(CX) + ADDQ $0x07, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3: + ADDL $0x000007c0, R8 + MOVL R8, (CX) + MOVW R9, 4(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3: + ADDL $0x000007a0, R8 + MOVL R8, (CX) + MOVB R9, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3: + SHLL $0x05, R13 + ORL R13, R8 + MOVL R8, (CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +two_byte_offset_match_nolit_encodeBetterBlockAsm: + CMPL R9, $0x00000400 + JA two_byte_match_nolit_encodeBetterBlockAsm + CMPL R13, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBetterBlockAsm + LEAL -1(R9), R8 + SHLL $0x06, R8 + LEAL -15(R8)(R13*4), R8 + MOVW R8, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_one_longer_match_nolit_encodeBetterBlockAsm: + CMPL R13, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm + LEAL -1(R9), R8 + SHLL $0x06, R8 + LEAL 61(R8), R8 + MOVW R8, (CX) + LEAL -18(R13), R8 + MOVB R8, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy1_repeat_match_nolit_encodeBetterBlockAsm: + LEAL -1(R9), R8 + SHLL $0x06, R8 + LEAL 57(R8), R8 + MOVW R8, (CX) + ADDQ $0x02, CX + SUBL $0x12, R13 + + // emitRepeat + LEAL -1(R13), R8 + CMPL R13, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm + LEAL -30(R13), R8 + CMPL R13, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm + CMPL R13, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm + MOVB $0xfc, (CX) + MOVL R8, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + 
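+	// emitRepeat tiers, per the thresholds above: lengths <= 29 take the
+	// single-byte form below (length*8 - 4); otherwise length-30 follows
+	// tag 0xec (1 byte), 0xf4 (2 bytes) or 0xfc, where the MOVL writes 4
+	// bytes at 1(CX) but CX advances only 4 in total, so the high byte is
+	// overwritten by the next token and 3 length bytes persist.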
+repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: + MOVB $0xec, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: + XORL R8, R8 + LEAL -4(R8)(R13*8), R8 + MOVB R8, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +two_byte_match_nolit_encodeBetterBlockAsm: + // emitCopy2 + LEAL -64(R9), R9 + LEAL -4(R13), R13 + MOVW R9, 1(CX) + CMPL R13, $0x3c + JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2 + LEAL -60(R13), R8 + CMPL R13, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2 + CMPL R13, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2 + MOVB $0xfe, (CX) + MOVL R8, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2: + MOVB $0xfa, (CX) + MOVW R8, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2: + MOVB $0xf6, (CX) + MOVB R8, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2: + MOVL $0x00000002, R8 + LEAL (R8)(R13*4), R8 + MOVB R8, (CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + + // emitLiteralsDstP + MOVL 12(SP), R8 + CMPL R8, DI + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm + MOVL DI, R9 + MOVL DI, 12(SP) + LEAQ (DX)(R8*1), R10 + SUBL R8, R9 + + // emitLiteral + LEAL -1(R9), R8 + CMPL R8, $0x1d + JB one_byte_match_emit_repeat_encodeBetterBlockAsm + SUBL $0x1d, R8 + CMPL R8, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm + CMPL R8, $0x00010000 + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm + MOVL R8, R11 + SHRL $0x10, R11 + MOVB $0xf8, (CX) + MOVW R8, 1(CX) + MOVB R11, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R8 + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +three_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVB $0xf0, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R8 + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +two_bytes_match_emit_repeat_encodeBetterBlockAsm: + MOVB $0xe8, (CX) + MOVB R8, 1(CX) + ADDL $0x1d, R8 + ADDQ $0x02, CX + CMPL R8, $0x40 + JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm + +one_byte_match_emit_repeat_encodeBetterBlockAsm: + SHLB $0x03, R8 + MOVB R8, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R9, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: + MOVOU (R10), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, 
-32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: + MOVQ R8, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm + +memmove_midmatch_emit_repeat_encodeBetterBlockAsm: + LEAQ (CX)(R9*1), R8 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R9, $0x20 + JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R10), X0 + MOVOU -16(R10)(R9*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R9*1) + JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + +memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm: + MOVQ R8, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm + +memmove_long_match_emit_repeat_encodeBetterBlockAsm: + LEAQ (CX)(R9*1), R8 + + // genMemMoveLong + MOVOU (R10), X0 + MOVOU 16(R10), X1 + MOVOU -32(R10)(R9*1), X2 + MOVOU -16(R10)(R9*1), X3 + MOVQ R9, R12 + SHRQ $0x05, R12 + MOVQ CX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R14 + SUBQ R11, R14 + DECQ R12 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R10)(R14*1), R11 + LEAQ -32(CX)(R14*1), R15 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R11 + ADDQ $0x20, R14 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R10)(R14*1), X4 + MOVOU -16(R10)(R14*1), X5 + MOVOA X4, -32(CX)(R14*1) + MOVOA X5, -16(CX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R9*1) + MOVOU X3, -16(CX)(R9*1) + MOVQ R8, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: + ADDL R13, AX + ADDL $0x04, R13 + MOVL AX, 12(SP) + + // emitRepeat + LEAL -1(R13), R8 + CMPL R13, $0x1d + JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm + LEAL -30(R13), R8 + CMPL R13, $0x0000011e + JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm + CMPL R13, $0x0001001e + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm + MOVB $0xfc, (CX) + MOVL R8, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm: + MOVB $0xf4, (CX) + MOVW R8, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm: + MOVB $0xec, (CX) + MOVB R8, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_one_match_nolit_repeat_encodeBetterBlockAsm: + XORL R8, R8 + LEAL -4(R8)(R13*8), R8 + MOVB R8, (CX) + ADDQ $0x01, CX + +match_nolit_emitcopy_end_encodeBetterBlockAsm: + CMPL AX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + 
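+// Index refresh: per the hash constants in this function, tmp appears to
+// hold two tables sized to the *[589824]byte argument: 131072 uint32
+// slots at offset 0, keyed by a 7-byte hash (SHLQ $0x08, IMULQ
+// $0x00cf1bbcdcbfa563, SHRQ $0x2f), and 16384 slots at offset 524288,
+// keyed by a 4-byte hash (SHLQ $0x20, IMULQ $0x9e3779b1, SHRQ $0x32);
+// 524288 + 4*16384 = 589824. The block below re-inserts positions around
+// the match into both tables so later searches can find it.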
+match_nolit_dst_ok_encodeBetterBlockAsm: + MOVQ tmp+48(FP), R8 + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, R10 + LEAQ 1(DI), DI + LEAQ -2(AX), R11 + MOVQ (DX)(DI*1), R12 + MOVQ 1(DX)(DI*1), R13 + MOVQ (DX)(R11*1), R14 + MOVQ 1(DX)(R11*1), R15 + SHLQ $0x08, R12 + IMULQ R9, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R10, R13 + SHRQ $0x32, R13 + SHLQ $0x08, R14 + IMULQ R9, R14 + SHRQ $0x2f, R14 + SHLQ $0x20, R15 + IMULQ R10, R15 + SHRQ $0x32, R15 + LEAQ 1(DI), R10 + LEAQ 1(R11), BP + MOVL DI, (R8)(R12*4) + MOVL R11, (R8)(R14*4) + LEAQ 1(R11)(DI*1), R12 + SHRQ $0x01, R12 + ADDQ $0x01, DI + SUBQ $0x01, R11 + MOVL R10, 524288(R8)(R13*4) + MOVL BP, 524288(R8)(R15*4) + +index_loop_encodeBetterBlockAsm: + CMPQ R12, R11 + JAE search_loop_encodeBetterBlockAsm + MOVQ (DX)(DI*1), R10 + MOVQ (DX)(R12*1), R13 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x2f, R10 + SHLQ $0x08, R13 + IMULQ R9, R13 + SHRQ $0x2f, R13 + MOVL DI, (R8)(R10*4) + MOVL R11, (R8)(R13*4) + ADDQ $0x02, DI + ADDQ $0x02, R12 + JMP index_loop_encodeBetterBlockAsm + +emit_remainder_encodeBetterBlockAsm: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 4(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm: + MOVQ src_len+32(FP), AX + + // emitLiteralsDstP + MOVL 12(SP), BX + CMPL BX, AX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (DX)(BX*1), AX + SUBL BX, SI + + // emitLiteral + LEAL -1(SI), DX + CMPL DX, $0x1d + JB one_byte_emit_remainder_encodeBetterBlockAsm + SUBL $0x1d, DX + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeBetterBlockAsm + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BL, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +three_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf0, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +two_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xe8, (CX) + MOVB DL, 1(CX) + ADDL $0x1d, DX + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_midemit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +one_byte_emit_remainder_encodeBetterBlockAsm: + SHLB $0x03, DL + MOVB DL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -1, min move: 1 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + 
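+// These memmove size classes use overlapping head/tail loads so any
+// length in a class is covered without a byte loop: 4through8 reads the
+// first and last 4 bytes (which overlap for lengths under 8),
+// 8through16 the first and last 8, and so on up to 33through64.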
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through8: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm + +memmove_midemit_remainder_encodeBetterBlockAsm: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -2, min move: 30 + CMPQ BX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm + +memmove_long_emit_remainder_encodeBetterBlockAsm: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm2MB(dst []byte, src []byte, tmp *[589824]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBetterBlockAsm2MB(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00001200, DX + PXOR X0, X0 + 
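+// DX = 0x1200 = 4608 iterations of 128 zeroed bytes: exactly the
+// 589824-byte tmp table area, cleared with SSE stores before the
+// search loop runs.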
+zero_loop_encodeBetterBlockAsm2MB: + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm2MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -17(AX), DX + LEAQ -17(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm2MB: + MOVQ tmp+48(FP), BX + MOVL AX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JBE check_maxskip_ok_encodeBetterBlockAsm2MB + LEAL 100(AX), SI + JMP check_maxskip_cont_encodeBetterBlockAsm2MB + +check_maxskip_ok_encodeBetterBlockAsm2MB: + LEAL 1(AX)(SI*1), SI + +check_maxskip_cont_encodeBetterBlockAsm2MB: + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm2MB + MOVQ (DX)(AX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x2f, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL (BX)(R10*4), SI + MOVL 524288(BX)(R11*4), R8 + MOVL AX, (BX)(R10*4) + MOVL AX, 524288(BX)(R11*4) + MOVQ (DX)(SI*1), R10 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm2MB + MOVQ (DX)(R8*1), R11 + CMPQ R11, DI + MOVL AX, R12 + SUBL 16(SP), R12 + MOVQ (DX)(R12*1), R12 + MOVQ $0x000000ffffffff00, R13 + XORQ DI, R12 + TESTQ R13, R12 + JNE no_repeat_found_encodeBetterBlockAsm2MB + LEAL 1(AX), BX + MOVL 12(SP), SI + MOVL BX, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm2MB + +repeat_extend_back_loop_encodeBetterBlockAsm2MB: + CMPL BX, SI + JBE repeat_extend_back_end_encodeBetterBlockAsm2MB + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(BX*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm2MB + LEAL -1(BX), BX + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm2MB + +repeat_extend_back_end_encodeBetterBlockAsm2MB: + MOVL BX, SI + SUBL 12(SP), SI + LEAQ 4(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBetterBlockAsm2MB + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBetterBlockAsm2MB: + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_repeat_emit_encodeBetterBlockAsm2MB + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBetterBlockAsm2MB + CMPL SI, $0x00010000 + JB three_bytes_repeat_emit_encodeBetterBlockAsm2MB + MOVL SI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, SI + JMP memmove_long_repeat_emit_encodeBetterBlockAsm2MB + +three_bytes_repeat_emit_encodeBetterBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_repeat_emit_encodeBetterBlockAsm2MB + +two_bytes_repeat_emit_encodeBetterBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midrepeat_emit_encodeBetterBlockAsm2MB + JMP memmove_long_repeat_emit_encodeBetterBlockAsm2MB + +one_byte_repeat_emit_encodeBetterBlockAsm2MB: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ DI, $0x10 + JBE 
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm2MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm2MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm2MB: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB + +memmove_midrepeat_emit_encodeBetterBlockAsm2MB: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm2MB + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm2MB: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB + +memmove_long_repeat_emit_encodeBetterBlockAsm2MB: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(CX)(R11*1) + MOVOA X5, -16(CX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB: + ADDL $0x05, AX + MOVL AX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm2MB + +matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm2MB: + MOVQ (R8)(R10*1), R9 + MOVQ 
8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm2MB + LEAL -16(DI), DI + LEAL 16(R10), R10 + +matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm2MB: + CMPL DI, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm2MB + JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm2MB + +matchlen_bsf_16repeat_extend_encodeBetterBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm2MB + +matchlen_match8_repeat_extend_encodeBetterBlockAsm2MB: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB + +matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm2MB + +matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeBetterBlockAsm2MB + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm2MB + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm2MB: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB + JB repeat_extend_forward_end_encodeBetterBlockAsm2MB + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm2MB + +matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm2MB + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeBetterBlockAsm2MB: + ADDL R10, AX + MOVL AX, SI + SUBL BX, SI + MOVL 16(SP), BX + + // emitRepeat + LEAL -1(SI), BX + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBetterBlockAsm2MB + LEAL -30(SI), BX + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBetterBlockAsm2MB + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBetterBlockAsm2MB + MOVB $0xfc, (CX) + MOVL BX, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBetterBlockAsm2MB + +repeat_three_match_repeat_encodeBetterBlockAsm2MB: + MOVB $0xf4, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBetterBlockAsm2MB + +repeat_two_match_repeat_encodeBetterBlockAsm2MB: + MOVB $0xec, (CX) + MOVB BL, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBetterBlockAsm2MB + +repeat_one_match_repeat_encodeBetterBlockAsm2MB: + XORL BX, BX + LEAL -4(BX)(SI*8), BX + MOVB BL, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBetterBlockAsm2MB: + MOVL AX, 12(SP) + JMP search_loop_encodeBetterBlockAsm2MB + +no_repeat_found_encodeBetterBlockAsm2MB: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm2MB + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm2MB + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm2MB + +candidateS_match_encodeBetterBlockAsm2MB: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x2f, R10 + MOVL (BX)(R10*4), SI + INCL AX + MOVL AX, (BX)(R10*4) + CMPL (DX)(SI*1), DI + JEQ 
candidate_match_encodeBetterBlockAsm2MB + DECL AX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm2MB: + MOVL 12(SP), BX + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm2MB + +match_extend_back_loop_encodeBetterBlockAsm2MB: + CMPL AX, BX + JBE match_extend_back_end_encodeBetterBlockAsm2MB + MOVB -1(DX)(SI*1), DI + MOVB -1(DX)(AX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm2MB + LEAL -1(AX), AX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm2MB + JMP match_extend_back_loop_encodeBetterBlockAsm2MB + +match_extend_back_end_encodeBetterBlockAsm2MB: + MOVL AX, BX + SUBL 12(SP), BX + LEAQ 4(CX)(BX*1), BX + CMPQ BX, (SP) + JB match_dst_size_check_encodeBetterBlockAsm2MB + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm2MB: + MOVL AX, BX + ADDL $0x04, AX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), R9 + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm2MB + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm2MB: + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm2MB + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm2MB: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm2MB + JMP matchlen_match8_match_nolit_encodeBetterBlockAsm2MB + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm2MB + +matchlen_match8_match_nolit_encodeBetterBlockAsm2MB: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm2MB + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm2MB + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm2MB + +matchlen_match4_match_nolit_encodeBetterBlockAsm2MB: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm2MB + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm2MB + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm2MB: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm2MB + JB match_nolit_end_encodeBetterBlockAsm2MB + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm2MB + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm2MB + +matchlen_match1_match_nolit_encodeBetterBlockAsm2MB: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm2MB + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm2MB: + MOVL AX, DI + SUBL SI, DI + CMPL R11, $0x01 + JA match_length_ok_encodeBetterBlockAsm2MB + CMPL DI, $0x0001003f + JBE match_length_ok_encodeBetterBlockAsm2MB + MOVL 20(SP), AX + INCL AX + JMP search_loop_encodeBetterBlockAsm2MB + +match_length_ok_encodeBetterBlockAsm2MB: + MOVL DI, 16(SP) + + // Check if we can combine lit+copy + MOVLQZX 12(SP), R8 + MOVL BX, SI + SUBL R8, SI + JZ 
match_emit_nolits_encodeBetterBlockAsm2MB + CMPL DI, $0x00000040 + JL match_emit_lits_encodeBetterBlockAsm2MB + CMPL DI, $0x0001003f + JA match_emit_copy3_encodeBetterBlockAsm2MB + CMPL SI, $0x04 + JA match_emit_lits_encodeBetterBlockAsm2MB + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, DI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, DI + CMOVLLT R11, DI + LEAL -1(SI)(DI*4), DI + MOVL $0x00000003, R10 + LEAL (R10)(DI*8), DI + MOVB DI, (CX) + ADDQ $0x03, CX + MOVL R8, (CX) + ADDQ SI, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm2MB + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm2MB + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm2MB + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm2MB: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm2MB: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm2MB: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +match_emit_copy3_encodeBetterBlockAsm2MB: + CMPL SI, $0x03 + JA match_emit_lits_encodeBetterBlockAsm2MB + MOVLQZX 12(SP), R8 + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(DI), DI + SHLL $0x0b, DI + LEAL 7(DI)(SI*8), DI + CMPL R11, $0x3c + JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm2MB + LEAL -60(R11), R9 + CMPL R11, $0x0000013c + JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm2MB + CMPL R11, $0x0001003c + JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm2MB + ADDL $0x000007e0, DI + MOVL DI, (CX) + MOVL R9, 4(CX) + ADDQ $0x07, CX + JMP match_emit_copy_litsencodeBetterBlockAsm2MB + +emit_copy3_2_match_emit_lits_encodeBetterBlockAsm2MB: + ADDL $0x000007c0, DI + MOVL DI, (CX) + MOVW R9, 4(CX) + ADDQ $0x06, CX + JMP match_emit_copy_litsencodeBetterBlockAsm2MB + +emit_copy3_1_match_emit_lits_encodeBetterBlockAsm2MB: + ADDL $0x000007a0, DI + MOVL DI, (CX) + MOVB R9, 4(CX) + ADDQ $0x05, CX + JMP match_emit_copy_litsencodeBetterBlockAsm2MB + +emit_copy3_0_match_emit_lits_encodeBetterBlockAsm2MB: + SHLL $0x05, R11 + ORL R11, DI + MOVL DI, (CX) + ADDQ $0x04, CX + +match_emit_copy_litsencodeBetterBlockAsm2MB: + MOVL R8, (CX) + ADDQ SI, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +match_emit_lits_encodeBetterBlockAsm2MB: + LEAQ (DX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBetterBlockAsm2MB + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm2MB + CMPL R9, $0x00010000 + JB three_bytes_match_emit_encodeBetterBlockAsm2MB + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBetterBlockAsm2MB + +three_bytes_match_emit_encodeBetterBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW R9, 
1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBetterBlockAsm2MB + +two_bytes_match_emit_encodeBetterBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBetterBlockAsm2MB + JMP memmove_long_match_emit_encodeBetterBlockAsm2MB + +one_byte_match_emit_encodeBetterBlockAsm2MB: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm2MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm2MB + +emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm2MB: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm2MB + +memmove_midmatch_emit_encodeBetterBlockAsm2MB: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm2MB + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_match_emit_encodeBetterBlockAsm2MB: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm2MB + +memmove_long_match_emit_encodeBetterBlockAsm2MB: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(R8)(R13*1), X4 + MOVOU -16(R8)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ SI, R13 + JAE 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +match_emit_nolits_encodeBetterBlockAsm2MB: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy + CMPL DI, $0x0001003f + JBE two_byte_offset_match_nolit_encodeBetterBlockAsm2MB + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(DI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm2MB_emit3 + LEAL -60(R11), DI + CMPL R11, $0x0000013c + JB emit_copy3_1_match_nolit_encodeBetterBlockAsm2MB_emit3 + CMPL R11, $0x0001003c + JB emit_copy3_2_match_nolit_encodeBetterBlockAsm2MB_emit3 + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) + ADDQ $0x07, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +emit_copy3_2_match_nolit_encodeBetterBlockAsm2MB_emit3: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +emit_copy3_1_match_nolit_encodeBetterBlockAsm2MB_emit3: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +emit_copy3_0_match_nolit_encodeBetterBlockAsm2MB_emit3: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +two_byte_offset_match_nolit_encodeBetterBlockAsm2MB: + CMPL DI, $0x00000400 + JA two_byte_match_nolit_encodeBetterBlockAsm2MB + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBetterBlockAsm2MB + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +emit_one_longer_match_nolit_encodeBetterBlockAsm2MB: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm2MB + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +emit_copy1_repeat_match_nolit_encodeBetterBlockAsm2MB: + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm2MB: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +two_byte_match_nolit_encodeBetterBlockAsm2MB: + // emitCopy2 + LEAL -64(DI), DI + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm2MB_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB 
emit_copy2_1_match_nolit_encodeBetterBlockAsm2MB_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBetterBlockAsm2MB_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +emit_copy2_2_match_nolit_encodeBetterBlockAsm2MB_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +emit_copy2_1_match_nolit_encodeBetterBlockAsm2MB_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +emit_copy2_0_match_nolit_encodeBetterBlockAsm2MB_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_match_emit_repeat_encodeBetterBlockAsm2MB + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm2MB + CMPL SI, $0x00010000 + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm2MB + MOVL SI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, SI + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB + +three_bytes_match_emit_repeat_encodeBetterBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB + +two_bytes_match_emit_repeat_encodeBetterBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm2MB + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB + +one_byte_match_emit_repeat_encodeBetterBlockAsm2MB: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB + +memmove_midmatch_emit_repeat_encodeBetterBlockAsm2MB: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64 + 
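+	// Mid-size literal copies (30-64 bytes) need at most two pairs of
+	// overlapping 16-byte SSE moves.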
+emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm2MB: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB + +memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm2MB: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm2MB + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm2MB + CMPL R11, $0x0001001e + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm2MB + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm2MB: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm2MB: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm2MB + +repeat_one_match_nolit_repeat_encodeBetterBlockAsm2MB: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + +match_nolit_emitcopy_end_encodeBetterBlockAsm2MB: + CMPL AX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm2MB + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm2MB + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm2MB: + MOVQ tmp+48(FP), SI + MOVQ $0x00cf1bbcdcbfa563, DI + MOVQ $0x9e3779b1, R8 + LEAQ 1(BX), BX + LEAQ -2(AX), R9 + MOVQ (DX)(BX*1), R10 + MOVQ 1(DX)(BX*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 + SHLQ $0x08, R10 + IMULQ DI, R10 + SHRQ $0x2f, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 + SHLQ $0x08, R12 + IMULQ DI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(BX), R8 + LEAQ 
1(R9), R14 + MOVL BX, (SI)(R10*4) + MOVL R9, (SI)(R12*4) + LEAQ 1(R9)(BX*1), R10 + SHRQ $0x01, R10 + ADDQ $0x01, BX + SUBQ $0x01, R9 + MOVL R8, 524288(SI)(R11*4) + MOVL R14, 524288(SI)(R13*4) + +index_loop_encodeBetterBlockAsm2MB: + CMPQ R10, R9 + JAE search_loop_encodeBetterBlockAsm2MB + MOVQ (DX)(BX*1), R8 + MOVQ (DX)(R10*1), R11 + SHLQ $0x08, R8 + IMULQ DI, R8 + SHRQ $0x2f, R8 + SHLQ $0x08, R11 + IMULQ DI, R11 + SHRQ $0x2f, R11 + MOVL BX, (SI)(R8*4) + MOVL R9, (SI)(R11*4) + ADDQ $0x02, BX + ADDQ $0x02, R10 + JMP index_loop_encodeBetterBlockAsm2MB + +emit_remainder_encodeBetterBlockAsm2MB: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 4(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm2MB + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm2MB: + MOVQ src_len+32(FP), AX + + // emitLiteralsDstP + MOVL 12(SP), BX + CMPL BX, AX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (DX)(BX*1), AX + SUBL BX, SI + + // emitLiteral + LEAL -1(SI), DX + CMPL DX, $0x1d + JB one_byte_emit_remainder_encodeBetterBlockAsm2MB + SUBL $0x1d, DX + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm2MB + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeBetterBlockAsm2MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BL, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm2MB + +three_bytes_emit_remainder_encodeBetterBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm2MB + +two_bytes_emit_remainder_encodeBetterBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB DL, 1(CX) + ADDL $0x1d, DX + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_midemit_remainder_encodeBetterBlockAsm2MB + JMP memmove_long_emit_remainder_encodeBetterBlockAsm2MB + +one_byte_emit_remainder_encodeBetterBlockAsm2MB: + SHLB $0x03, DL + MOVB DL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -1, min move: 1 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_3 + CMPQ BX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_4through8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_4through8: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB + 
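+	// 17-64 byte copies use two or four overlapping 16-byte SSE moves.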
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB + +memmove_midemit_remainder_encodeBetterBlockAsm2MB: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -2, min move: 30 + CMPQ BX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm2MB + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm2MB: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB + +memmove_long_emit_remainder_encodeBetterBlockAsm2MB: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm2MB: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm512K(dst []byte, src []byte, tmp *[294912]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBetterBlockAsm512K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000900, DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm512K: + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm512K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + 
SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm512K: + MOVQ tmp+48(FP), BX + MOVL AX, SI + SUBL 12(SP), SI + SHRL $0x07, SI + CMPL SI, $0x63 + JBE check_maxskip_ok_encodeBetterBlockAsm512K + LEAL 100(AX), SI + JMP check_maxskip_cont_encodeBetterBlockAsm512K + +check_maxskip_ok_encodeBetterBlockAsm512K: + LEAL 1(AX)(SI*1), SI + +check_maxskip_cont_encodeBetterBlockAsm512K: + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm512K + MOVQ (DX)(AX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x33, R11 + MOVL (BX)(R10*4), SI + MOVL 262144(BX)(R11*4), R8 + MOVL AX, (BX)(R10*4) + MOVL AX, 262144(BX)(R11*4) + MOVQ (DX)(SI*1), R10 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm512K + MOVQ (DX)(R8*1), R11 + CMPQ R11, DI + MOVL AX, R12 + SUBL 16(SP), R12 + MOVQ (DX)(R12*1), R12 + MOVQ $0x000000ffffffff00, R13 + XORQ DI, R12 + TESTQ R13, R12 + JNE no_repeat_found_encodeBetterBlockAsm512K + LEAL 1(AX), BX + MOVL 12(SP), SI + MOVL BX, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm512K + +repeat_extend_back_loop_encodeBetterBlockAsm512K: + CMPL BX, SI + JBE repeat_extend_back_end_encodeBetterBlockAsm512K + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(BX*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm512K + LEAL -1(BX), BX + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm512K + +repeat_extend_back_end_encodeBetterBlockAsm512K: + MOVL BX, SI + SUBL 12(SP), SI + LEAQ 4(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBetterBlockAsm512K + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBetterBlockAsm512K: + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm512K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_repeat_emit_encodeBetterBlockAsm512K + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBetterBlockAsm512K + CMPL SI, $0x00010000 + JB three_bytes_repeat_emit_encodeBetterBlockAsm512K + MOVL SI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, SI + JMP memmove_long_repeat_emit_encodeBetterBlockAsm512K + +three_bytes_repeat_emit_encodeBetterBlockAsm512K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_repeat_emit_encodeBetterBlockAsm512K + +two_bytes_repeat_emit_encodeBetterBlockAsm512K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midrepeat_emit_encodeBetterBlockAsm512K + JMP memmove_long_repeat_emit_encodeBetterBlockAsm512K + +one_byte_repeat_emit_encodeBetterBlockAsm512K: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8: 
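+	// Up to 8 literal bytes: a single quadword load/store covers the move;
+	// the 8-byte margin makes the possible overwrite safe.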
+ MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm512K + +memmove_midrepeat_emit_encodeBetterBlockAsm512K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm512K + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm512K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm512K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm512K + +memmove_long_repeat_emit_encodeBetterBlockAsm512K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(CX)(R11*1) + MOVOA X5, -16(CX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm512K: + ADDL $0x05, AX + MOVL AX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm512K + +matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm512K: + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K + 
XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm512K + LEAL -16(DI), DI + LEAL 16(R10), R10 + +matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm512K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm512K + JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm512K + +matchlen_bsf_16repeat_extend_encodeBetterBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm512K + +matchlen_match8_repeat_extend_encodeBetterBlockAsm512K: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeBetterBlockAsm512K + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm512K + +matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm512K + +matchlen_match4_repeat_extend_encodeBetterBlockAsm512K: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeBetterBlockAsm512K + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm512K + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm512K: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeBetterBlockAsm512K + JB repeat_extend_forward_end_encodeBetterBlockAsm512K + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm512K + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm512K + +matchlen_match1_repeat_extend_encodeBetterBlockAsm512K: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm512K + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeBetterBlockAsm512K: + ADDL R10, AX + MOVL AX, SI + SUBL BX, SI + MOVL 16(SP), BX + + // emitRepeat + LEAL -1(SI), BX + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBetterBlockAsm512K + LEAL -30(SI), BX + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBetterBlockAsm512K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBetterBlockAsm512K + MOVB $0xfc, (CX) + MOVL BX, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBetterBlockAsm512K + +repeat_three_match_repeat_encodeBetterBlockAsm512K: + MOVB $0xf4, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBetterBlockAsm512K + +repeat_two_match_repeat_encodeBetterBlockAsm512K: + MOVB $0xec, (CX) + MOVB BL, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBetterBlockAsm512K + +repeat_one_match_repeat_encodeBetterBlockAsm512K: + XORL BX, BX + LEAL -4(BX)(SI*8), BX + MOVB BL, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBetterBlockAsm512K: + MOVL AX, 12(SP) + JMP search_loop_encodeBetterBlockAsm512K + +no_repeat_found_encodeBetterBlockAsm512K: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm512K + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm512K + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm512K + +candidateS_match_encodeBetterBlockAsm512K: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL (BX)(R10*4), SI + INCL AX + MOVL AX, (BX)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm512K + DECL AX + MOVL R8, SI + 
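+	// A match candidate was accepted; first extend it backwards over any
+	// equal bytes before emitting literals and the copy.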
+candidate_match_encodeBetterBlockAsm512K: + MOVL 12(SP), BX + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm512K + +match_extend_back_loop_encodeBetterBlockAsm512K: + CMPL AX, BX + JBE match_extend_back_end_encodeBetterBlockAsm512K + MOVB -1(DX)(SI*1), DI + MOVB -1(DX)(AX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm512K + LEAL -1(AX), AX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm512K + JMP match_extend_back_loop_encodeBetterBlockAsm512K + +match_extend_back_end_encodeBetterBlockAsm512K: + MOVL AX, BX + SUBL 12(SP), BX + LEAQ 4(CX)(BX*1), BX + CMPQ BX, (SP) + JB match_dst_size_check_encodeBetterBlockAsm512K + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm512K: + MOVL AX, BX + ADDL $0x04, AX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), R9 + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm512K + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm512K: + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm512K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm512K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm512K + JMP matchlen_match8_match_nolit_encodeBetterBlockAsm512K + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm512K + +matchlen_match8_match_nolit_encodeBetterBlockAsm512K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm512K + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm512K + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm512K + +matchlen_match4_match_nolit_encodeBetterBlockAsm512K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm512K + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm512K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm512K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm512K + JB match_nolit_end_encodeBetterBlockAsm512K + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm512K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm512K + +matchlen_match1_match_nolit_encodeBetterBlockAsm512K: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm512K + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm512K: + MOVL AX, DI + SUBL SI, DI + CMPL R11, $0x01 + JA match_length_ok_encodeBetterBlockAsm512K + CMPL DI, $0x0001003f + JBE match_length_ok_encodeBetterBlockAsm512K + MOVL 20(SP), AX + INCL AX + JMP search_loop_encodeBetterBlockAsm512K + +match_length_ok_encodeBetterBlockAsm512K: + MOVL DI, 16(SP) + + // Check if we can combine lit+copy + MOVLQZX 12(SP), R8 + MOVL BX, SI + SUBL R8, SI + JZ 
match_emit_nolits_encodeBetterBlockAsm512K + CMPL DI, $0x00000040 + JL match_emit_lits_encodeBetterBlockAsm512K + CMPL DI, $0x0001003f + JA match_emit_copy3_encodeBetterBlockAsm512K + CMPL SI, $0x04 + JA match_emit_lits_encodeBetterBlockAsm512K + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, DI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, DI + CMOVLLT R11, DI + LEAL -1(SI)(DI*4), DI + MOVL $0x00000003, R10 + LEAL (R10)(DI*8), DI + MOVB DI, (CX) + ADDQ $0x03, CX + MOVL R8, (CX) + ADDQ SI, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm512K + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm512K + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm512K + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm512K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm512K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm512K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm512K: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +match_emit_copy3_encodeBetterBlockAsm512K: + CMPL SI, $0x03 + JA match_emit_lits_encodeBetterBlockAsm512K + MOVLQZX 12(SP), R8 + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(DI), DI + SHLL $0x0b, DI + LEAL 7(DI)(SI*8), DI + CMPL R11, $0x3c + JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm512K + LEAL -60(R11), R9 + CMPL R11, $0x0000013c + JB emit_copy3_1_match_emit_lits_encodeBetterBlockAsm512K + CMPL R11, $0x0001003c + JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm512K + ADDL $0x000007e0, DI + MOVL DI, (CX) + MOVL R9, 4(CX) + ADDQ $0x07, CX + JMP match_emit_copy_litsencodeBetterBlockAsm512K + +emit_copy3_2_match_emit_lits_encodeBetterBlockAsm512K: + ADDL $0x000007c0, DI + MOVL DI, (CX) + MOVW R9, 4(CX) + ADDQ $0x06, CX + JMP match_emit_copy_litsencodeBetterBlockAsm512K + +emit_copy3_1_match_emit_lits_encodeBetterBlockAsm512K: + ADDL $0x000007a0, DI + MOVL DI, (CX) + MOVB R9, 4(CX) + ADDQ $0x05, CX + JMP match_emit_copy_litsencodeBetterBlockAsm512K + +emit_copy3_0_match_emit_lits_encodeBetterBlockAsm512K: + SHLL $0x05, R11 + ORL R11, DI + MOVL DI, (CX) + ADDQ $0x04, CX + +match_emit_copy_litsencodeBetterBlockAsm512K: + MOVL R8, (CX) + ADDQ SI, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +match_emit_lits_encodeBetterBlockAsm512K: + LEAQ (DX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBetterBlockAsm512K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm512K + CMPL R9, $0x00010000 + JB three_bytes_match_emit_encodeBetterBlockAsm512K + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBetterBlockAsm512K + +three_bytes_match_emit_encodeBetterBlockAsm512K: 
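+	// Three-byte literal header: tag 0xf0 followed by a 16-bit (length-30) field.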
+ MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBetterBlockAsm512K + +two_bytes_match_emit_encodeBetterBlockAsm512K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBetterBlockAsm512K + JMP memmove_long_match_emit_encodeBetterBlockAsm512K + +one_byte_match_emit_encodeBetterBlockAsm512K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ SI, $0x08 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8: + MOVQ (R8), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8through16: + MOVQ (R8), R10 + MOVQ -8(R8)(SI*1), R8 + MOVQ R10, (CX) + MOVQ R8, -8(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm512K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm512K + +memmove_midmatch_emit_encodeBetterBlockAsm512K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm512K + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm512K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_match_emit_encodeBetterBlockAsm512K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm512K + +memmove_long_match_emit_encodeBetterBlockAsm512K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R13*1), X4 + MOVOU -16(R8)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ SI, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +match_emit_nolits_encodeBetterBlockAsm512K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy + CMPL DI, $0x0001003f + JBE two_byte_offset_match_nolit_encodeBetterBlockAsm512K + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(DI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm512K_emit3 + LEAL -60(R11), DI + CMPL R11, $0x0000013c + JB emit_copy3_1_match_nolit_encodeBetterBlockAsm512K_emit3 + CMPL R11, $0x0001003c + JB emit_copy3_2_match_nolit_encodeBetterBlockAsm512K_emit3 + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) + ADDQ $0x07, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +emit_copy3_2_match_nolit_encodeBetterBlockAsm512K_emit3: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +emit_copy3_1_match_nolit_encodeBetterBlockAsm512K_emit3: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +emit_copy3_0_match_nolit_encodeBetterBlockAsm512K_emit3: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +two_byte_offset_match_nolit_encodeBetterBlockAsm512K: + CMPL DI, $0x00000400 + JA two_byte_match_nolit_encodeBetterBlockAsm512K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBetterBlockAsm512K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +emit_one_longer_match_nolit_encodeBetterBlockAsm512K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm512K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +emit_copy1_repeat_match_nolit_encodeBetterBlockAsm512K: + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm512K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ 
$0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +two_byte_match_nolit_encodeBetterBlockAsm512K: + // emitCopy2 + LEAL -64(DI), DI + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm512K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBetterBlockAsm512K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBetterBlockAsm512K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +emit_copy2_2_match_nolit_encodeBetterBlockAsm512K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +emit_copy2_1_match_nolit_encodeBetterBlockAsm512K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +emit_copy2_0_match_nolit_encodeBetterBlockAsm512K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_match_emit_repeat_encodeBetterBlockAsm512K + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm512K + CMPL SI, $0x00010000 + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm512K + MOVL SI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, SI + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm512K + +three_bytes_match_emit_repeat_encodeBetterBlockAsm512K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm512K + +two_bytes_match_emit_repeat_encodeBetterBlockAsm512K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm512K + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm512K + +one_byte_match_emit_repeat_encodeBetterBlockAsm512K: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64: + MOVOU 
(R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K + +memmove_midmatch_emit_repeat_encodeBetterBlockAsm512K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm512K + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm512K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K + +memmove_long_match_emit_repeat_encodeBetterBlockAsm512K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm512K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm512K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm512K + CMPL R11, $0x0001001e + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm512K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm512K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm512K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm512K + +repeat_one_match_nolit_repeat_encodeBetterBlockAsm512K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + 
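+	// All copy/repeat emitters converge below: continue with the remainder once input is exhausted, re-index and keep searching while output space (bound kept at (SP)) remains, and return 0 if dst is too small.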
+match_nolit_emitcopy_end_encodeBetterBlockAsm512K: + CMPL AX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm512K + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm512K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm512K: + MOVQ tmp+48(FP), SI + MOVQ $0x00cf1bbcdcbfa563, DI + MOVQ $0x9e3779b1, R8 + LEAQ 1(BX), BX + LEAQ -2(AX), R9 + MOVQ (DX)(BX*1), R10 + MOVQ 1(DX)(BX*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 + SHLQ $0x08, R10 + IMULQ DI, R10 + SHRQ $0x30, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x33, R11 + SHLQ $0x08, R12 + IMULQ DI, R12 + SHRQ $0x30, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x33, R13 + LEAQ 1(BX), R8 + LEAQ 1(R9), R14 + MOVL BX, (SI)(R10*4) + MOVL R9, (SI)(R12*4) + LEAQ 1(R9)(BX*1), R10 + SHRQ $0x01, R10 + ADDQ $0x01, BX + SUBQ $0x01, R9 + MOVL R8, 262144(SI)(R11*4) + MOVL R14, 262144(SI)(R13*4) + +index_loop_encodeBetterBlockAsm512K: + CMPQ R10, R9 + JAE search_loop_encodeBetterBlockAsm512K + MOVQ (DX)(BX*1), R8 + MOVQ (DX)(R10*1), R11 + SHLQ $0x08, R8 + IMULQ DI, R8 + SHRQ $0x30, R8 + SHLQ $0x08, R11 + IMULQ DI, R11 + SHRQ $0x30, R11 + MOVL BX, (SI)(R8*4) + MOVL R9, (SI)(R11*4) + ADDQ $0x02, BX + ADDQ $0x02, R10 + JMP index_loop_encodeBetterBlockAsm512K + +emit_remainder_encodeBetterBlockAsm512K: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 4(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm512K + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm512K: + MOVQ src_len+32(FP), AX + + // emitLiteralsDstP + MOVL 12(SP), BX + CMPL BX, AX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm512K + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (DX)(BX*1), AX + SUBL BX, SI + + // emitLiteral + LEAL -1(SI), DX + CMPL DX, $0x1d + JB one_byte_emit_remainder_encodeBetterBlockAsm512K + SUBL $0x1d, DX + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm512K + CMPL DX, $0x00010000 + JB three_bytes_emit_remainder_encodeBetterBlockAsm512K + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BL, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm512K + +three_bytes_emit_remainder_encodeBetterBlockAsm512K: + MOVB $0xf0, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm512K + +two_bytes_emit_remainder_encodeBetterBlockAsm512K: + MOVB $0xe8, (CX) + MOVB DL, 1(CX) + ADDL $0x1d, DX + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_midemit_remainder_encodeBetterBlockAsm512K + JMP memmove_long_emit_remainder_encodeBetterBlockAsm512K + +one_byte_emit_remainder_encodeBetterBlockAsm512K: + SHLB $0x03, DL + MOVB DL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -1, min move: 1 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_3 + CMPQ BX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_4through8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, 
-1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_4through8: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm512K + +memmove_midemit_remainder_encodeBetterBlockAsm512K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -2, min move: 30 + CMPQ BX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm512K + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm512K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm512K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm512K + +memmove_long_emit_remainder_encodeBetterBlockAsm512K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX 
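+	// Remainder flushed: the return value is the number of bytes written, i.e. the current dst pointer minus dst_base.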
+ +emit_literal_done_emit_remainder_encodeBetterBlockAsm512K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm64K(dst []byte, src []byte, tmp *[73728]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBetterBlockAsm64K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000240, DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm64K: + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm64K: + MOVQ tmp+48(FP), BX + MOVL AX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(AX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm64K + MOVQ (DX)(AX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x31, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x34, R11 + MOVWLZX (BX)(R10*2), SI + MOVWLZX 65536(BX)(R11*2), R8 + MOVW AX, (BX)(R10*2) + MOVW AX, 65536(BX)(R11*2) + MOVQ (DX)(SI*1), R10 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm64K + MOVQ (DX)(R8*1), R11 + CMPQ R11, DI + MOVL AX, R12 + SUBL 16(SP), R12 + MOVQ (DX)(R12*1), R12 + MOVQ $0x000000ffffffff00, R13 + XORQ DI, R12 + TESTQ R13, R12 + JNE no_repeat_found_encodeBetterBlockAsm64K + LEAL 1(AX), BX + MOVL 12(SP), SI + MOVL BX, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm64K + +repeat_extend_back_loop_encodeBetterBlockAsm64K: + CMPL BX, SI + JBE repeat_extend_back_end_encodeBetterBlockAsm64K + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(BX*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm64K + LEAL -1(BX), BX + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm64K + +repeat_extend_back_end_encodeBetterBlockAsm64K: + MOVL BX, SI + SUBL 12(SP), SI + LEAQ 4(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBetterBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBetterBlockAsm64K: + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm64K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_repeat_emit_encodeBetterBlockAsm64K + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBetterBlockAsm64K + JB three_bytes_repeat_emit_encodeBetterBlockAsm64K + MOVL SI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, SI + JMP memmove_long_repeat_emit_encodeBetterBlockAsm64K + +three_bytes_repeat_emit_encodeBetterBlockAsm64K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_repeat_emit_encodeBetterBlockAsm64K + +two_bytes_repeat_emit_encodeBetterBlockAsm64K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midrepeat_emit_encodeBetterBlockAsm64K + JMP memmove_long_repeat_emit_encodeBetterBlockAsm64K + +one_byte_repeat_emit_encodeBetterBlockAsm64K: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort 
+ // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm64K + +memmove_midrepeat_emit_encodeBetterBlockAsm64K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm64K + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm64K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm64K + +memmove_long_repeat_emit_encodeBetterBlockAsm64K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(CX)(R11*1) + MOVOA X5, -16(CX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + 
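+	// Literals are flushed; extend the repeat match forward 16 bytes at a time (XOR plus TZCNT/BSF locates the first mismatching byte) and encode the length with emitRepeat.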
+emit_literal_done_repeat_emit_encodeBetterBlockAsm64K: + ADDL $0x05, AX + MOVL AX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm64K + +matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm64K: + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm64K + LEAL -16(DI), DI + LEAL 16(R10), R10 + +matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm64K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm64K + JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm64K + +matchlen_bsf_16repeat_extend_encodeBetterBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm64K + +matchlen_match8_repeat_extend_encodeBetterBlockAsm64K: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeBetterBlockAsm64K + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm64K + +matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm64K + +matchlen_match4_repeat_extend_encodeBetterBlockAsm64K: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeBetterBlockAsm64K + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm64K + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm64K: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeBetterBlockAsm64K + JB repeat_extend_forward_end_encodeBetterBlockAsm64K + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm64K + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm64K + +matchlen_match1_repeat_extend_encodeBetterBlockAsm64K: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm64K + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeBetterBlockAsm64K: + ADDL R10, AX + MOVL AX, SI + SUBL BX, SI + MOVL 16(SP), BX + + // emitRepeat + LEAL -1(SI), BX + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBetterBlockAsm64K + LEAL -30(SI), BX + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBetterBlockAsm64K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBetterBlockAsm64K + MOVB $0xfc, (CX) + MOVL BX, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBetterBlockAsm64K + +repeat_three_match_repeat_encodeBetterBlockAsm64K: + MOVB $0xf4, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBetterBlockAsm64K + +repeat_two_match_repeat_encodeBetterBlockAsm64K: + MOVB $0xec, (CX) + MOVB BL, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBetterBlockAsm64K + +repeat_one_match_repeat_encodeBetterBlockAsm64K: + XORL BX, BX + LEAL -4(BX)(SI*8), BX + MOVB BL, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBetterBlockAsm64K: + MOVL AX, 12(SP) + JMP search_loop_encodeBetterBlockAsm64K + +no_repeat_found_encodeBetterBlockAsm64K: + CMPL R10, DI + JEQ 
candidate_match_encodeBetterBlockAsm64K + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm64K + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm64K + +candidateS_match_encodeBetterBlockAsm64K: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x31, R10 + MOVWLZX (BX)(R10*2), SI + INCL AX + MOVW AX, (BX)(R10*2) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm64K + DECL AX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm64K: + MOVL 12(SP), BX + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm64K + +match_extend_back_loop_encodeBetterBlockAsm64K: + CMPL AX, BX + JBE match_extend_back_end_encodeBetterBlockAsm64K + MOVB -1(DX)(SI*1), DI + MOVB -1(DX)(AX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm64K + LEAL -1(AX), AX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm64K + JMP match_extend_back_loop_encodeBetterBlockAsm64K + +match_extend_back_end_encodeBetterBlockAsm64K: + MOVL AX, BX + SUBL 12(SP), BX + LEAQ 4(CX)(BX*1), BX + CMPQ BX, (SP) + JB match_dst_size_check_encodeBetterBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm64K: + MOVL AX, BX + ADDL $0x04, AX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), R9 + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm64K + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm64K: + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm64K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm64K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm64K + JMP matchlen_match8_match_nolit_encodeBetterBlockAsm64K + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm64K + +matchlen_match8_match_nolit_encodeBetterBlockAsm64K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm64K + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm64K + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm64K + +matchlen_match4_match_nolit_encodeBetterBlockAsm64K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm64K + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm64K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm64K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm64K + JB match_nolit_end_encodeBetterBlockAsm64K + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm64K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm64K + +matchlen_match1_match_nolit_encodeBetterBlockAsm64K: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm64K + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm64K: + MOVL AX, 
DI + SUBL SI, DI + MOVL DI, 16(SP) + + // Check if we can combine lit+copy + MOVLQZX 12(SP), R8 + MOVL BX, SI + SUBL R8, SI + JZ match_emit_nolits_encodeBetterBlockAsm64K + CMPL DI, $0x00000040 + JL match_emit_lits_encodeBetterBlockAsm64K + CMPL SI, $0x04 + JA match_emit_lits_encodeBetterBlockAsm64K + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, DI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, DI + CMOVLLT R11, DI + LEAL -1(SI)(DI*4), DI + MOVL $0x00000003, R10 + LEAL (R10)(DI*8), DI + MOVB DI, (CX) + ADDQ $0x03, CX + MOVL R8, (CX) + ADDQ SI, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm64K + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm64K + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm64K + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm64K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm64K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm64K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm64K: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +match_emit_lits_encodeBetterBlockAsm64K: + LEAQ (DX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBetterBlockAsm64K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm64K + JB three_bytes_match_emit_encodeBetterBlockAsm64K + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBetterBlockAsm64K + +three_bytes_match_emit_encodeBetterBlockAsm64K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBetterBlockAsm64K + +two_bytes_match_emit_encodeBetterBlockAsm64K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBetterBlockAsm64K + JMP memmove_long_match_emit_encodeBetterBlockAsm64K + +one_byte_match_emit_encodeBetterBlockAsm64K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ SI, $0x08 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8: + MOVQ (R8), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8through16: + MOVQ (R8), R10 + MOVQ -8(R8)(SI*1), R8 + MOVQ R10, (CX) + MOVQ R8, -8(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K + 
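+	// 17..32 bytes: two overlapping 16-byte SSE loads/stores cover the whole run.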
+emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm64K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm64K + +memmove_midmatch_emit_encodeBetterBlockAsm64K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm64K + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_match_emit_encodeBetterBlockAsm64K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm64K + +memmove_long_match_emit_encodeBetterBlockAsm64K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R13*1), X4 + MOVOU -16(R8)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ SI, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +match_emit_nolits_encodeBetterBlockAsm64K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy + CMPL DI, $0x00000400 + JA two_byte_match_nolit_encodeBetterBlockAsm64K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBetterBlockAsm64K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +emit_one_longer_match_nolit_encodeBetterBlockAsm64K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm64K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +emit_copy1_repeat_match_nolit_encodeBetterBlockAsm64K: + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, 
(CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm64K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +two_byte_match_nolit_encodeBetterBlockAsm64K: + // emitCopy2 + LEAL -64(DI), DI + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm64K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBetterBlockAsm64K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBetterBlockAsm64K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +emit_copy2_2_match_nolit_encodeBetterBlockAsm64K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +emit_copy2_1_match_nolit_encodeBetterBlockAsm64K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +emit_copy2_0_match_nolit_encodeBetterBlockAsm64K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_match_emit_repeat_encodeBetterBlockAsm64K + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm64K + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm64K + MOVL SI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, SI + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm64K + +three_bytes_match_emit_repeat_encodeBetterBlockAsm64K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm64K + +two_bytes_match_emit_repeat_encodeBetterBlockAsm64K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm64K + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm64K + +one_byte_match_emit_repeat_encodeBetterBlockAsm64K: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8through16 + CMPQ DI, 
$0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K + +memmove_midmatch_emit_repeat_encodeBetterBlockAsm64K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm64K + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm64K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K + +memmove_long_match_emit_repeat_encodeBetterBlockAsm64K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm64K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) 
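+	// The offset matches the previous copy, so only the match length needs to be emitted.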
+ + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm64K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm64K + CMPL R11, $0x0001001e + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm64K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm64K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm64K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm64K + +repeat_one_match_nolit_repeat_encodeBetterBlockAsm64K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + +match_nolit_emitcopy_end_encodeBetterBlockAsm64K: + CMPL AX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm64K + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm64K: + MOVQ tmp+48(FP), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ $0x9e3779b1, R8 + LEAQ 1(BX), BX + LEAQ -2(AX), R9 + MOVQ (DX)(BX*1), R10 + MOVQ 1(DX)(BX*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ DI, R10 + SHRQ $0x31, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x34, R11 + SHLQ $0x10, R12 + IMULQ DI, R12 + SHRQ $0x31, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x34, R13 + LEAQ 1(BX), R8 + LEAQ 1(R9), R14 + MOVW BX, (SI)(R10*2) + MOVW R9, (SI)(R12*2) + LEAQ 1(R9)(BX*1), R10 + SHRQ $0x01, R10 + ADDQ $0x01, BX + SUBQ $0x01, R9 + MOVW R8, 65536(SI)(R11*2) + MOVW R14, 65536(SI)(R13*2) + +index_loop_encodeBetterBlockAsm64K: + CMPQ R10, R9 + JAE search_loop_encodeBetterBlockAsm64K + MOVQ (DX)(BX*1), R8 + MOVQ (DX)(R10*1), R11 + SHLQ $0x10, R8 + IMULQ DI, R8 + SHRQ $0x31, R8 + SHLQ $0x10, R11 + IMULQ DI, R11 + SHRQ $0x31, R11 + MOVW BX, (SI)(R8*2) + MOVW R9, (SI)(R11*2) + ADDQ $0x02, BX + ADDQ $0x02, R10 + JMP index_loop_encodeBetterBlockAsm64K + +emit_remainder_encodeBetterBlockAsm64K: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 4(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm64K + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm64K: + MOVQ src_len+32(FP), AX + + // emitLiteralsDstP + MOVL 12(SP), BX + CMPL BX, AX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm64K + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (DX)(BX*1), AX + SUBL BX, SI + + // emitLiteral + LEAL -1(SI), DX + CMPL DX, $0x1d + JB one_byte_emit_remainder_encodeBetterBlockAsm64K + SUBL $0x1d, DX + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm64K + JB three_bytes_emit_remainder_encodeBetterBlockAsm64K + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (CX) + MOVW DX, 1(CX) + MOVB BL, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm64K + +three_bytes_emit_remainder_encodeBetterBlockAsm64K: + MOVB $0xf0, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm64K + +two_bytes_emit_remainder_encodeBetterBlockAsm64K: + MOVB $0xe8, (CX) + MOVB DL, 1(CX) + ADDL $0x1d, DX + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_midemit_remainder_encodeBetterBlockAsm64K + JMP memmove_long_emit_remainder_encodeBetterBlockAsm64K + +one_byte_emit_remainder_encodeBetterBlockAsm64K: + SHLB $0x03, DL + MOVB DL, (CX) + 
ADDQ $0x01, CX + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -1, min move: 1 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_3 + CMPQ BX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_4through8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_4through8: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm64K + +memmove_midemit_remainder_encodeBetterBlockAsm64K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -2, min move: 30 + CMPQ BX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm64K + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm64K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm64K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm64K + +memmove_long_emit_remainder_encodeBetterBlockAsm64K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm64K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm16K(dst []byte, src []byte, tmp *[36864]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBetterBlockAsm16K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000120, DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm16K: + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm16K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm16K: + MOVQ tmp+48(FP), BX + MOVL AX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 1(AX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm16K + MOVQ (DX)(AX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x35, R11 + MOVWLZX (BX)(R10*2), SI + MOVWLZX 32768(BX)(R11*2), R8 + MOVW AX, (BX)(R10*2) + MOVW AX, 32768(BX)(R11*2) + MOVQ (DX)(SI*1), R10 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm16K + MOVQ (DX)(R8*1), R11 + CMPQ R11, DI + MOVL AX, R12 + SUBL 16(SP), R12 + MOVQ (DX)(R12*1), R12 + MOVQ $0x000000ffffffff00, R13 + XORQ DI, R12 + TESTQ R13, R12 + JNE no_repeat_found_encodeBetterBlockAsm16K + LEAL 1(AX), BX + MOVL 12(SP), SI + MOVL BX, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm16K + +repeat_extend_back_loop_encodeBetterBlockAsm16K: + CMPL BX, SI + JBE repeat_extend_back_end_encodeBetterBlockAsm16K + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(BX*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm16K + LEAL -1(BX), BX + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm16K + +repeat_extend_back_end_encodeBetterBlockAsm16K: + MOVL BX, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBetterBlockAsm16K + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBetterBlockAsm16K: + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm16K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_repeat_emit_encodeBetterBlockAsm16K + SUBL $0x1d, SI + CMPL 
SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBetterBlockAsm16K + JB three_bytes_repeat_emit_encodeBetterBlockAsm16K + +three_bytes_repeat_emit_encodeBetterBlockAsm16K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_repeat_emit_encodeBetterBlockAsm16K + +two_bytes_repeat_emit_encodeBetterBlockAsm16K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midrepeat_emit_encodeBetterBlockAsm16K + JMP memmove_long_repeat_emit_encodeBetterBlockAsm16K + +one_byte_repeat_emit_encodeBetterBlockAsm16K: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm16K + +memmove_midrepeat_emit_encodeBetterBlockAsm16K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm16K + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm16K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm16K + +memmove_long_repeat_emit_encodeBetterBlockAsm16K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + 
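+	// Long copy: the head and tail are staged in X0-X3 up front; the loop below moves 32 bytes per iteration to a 32-byte-aligned destination (MOVOA).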
+emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(CX)(R11*1) + MOVOA X5, -16(CX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm16K: + ADDL $0x05, AX + MOVL AX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm16K + +matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm16K: + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm16K + LEAL -16(DI), DI + LEAL 16(R10), R10 + +matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm16K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm16K + JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm16K + +matchlen_bsf_16repeat_extend_encodeBetterBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm16K + +matchlen_match8_repeat_extend_encodeBetterBlockAsm16K: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeBetterBlockAsm16K + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm16K + +matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm16K + +matchlen_match4_repeat_extend_encodeBetterBlockAsm16K: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeBetterBlockAsm16K + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm16K + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm16K: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeBetterBlockAsm16K + JB repeat_extend_forward_end_encodeBetterBlockAsm16K + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm16K + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm16K + +matchlen_match1_repeat_extend_encodeBetterBlockAsm16K: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm16K + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeBetterBlockAsm16K: + ADDL R10, AX + MOVL AX, SI + SUBL BX, SI + MOVL 16(SP), BX + + // emitRepeat + LEAL -1(SI), BX + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBetterBlockAsm16K + LEAL -30(SI), BX + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBetterBlockAsm16K + CMPL SI, $0x0001001e + JB 
repeat_three_match_repeat_encodeBetterBlockAsm16K + MOVB $0xfc, (CX) + MOVL BX, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBetterBlockAsm16K + +repeat_three_match_repeat_encodeBetterBlockAsm16K: + MOVB $0xf4, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBetterBlockAsm16K + +repeat_two_match_repeat_encodeBetterBlockAsm16K: + MOVB $0xec, (CX) + MOVB BL, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBetterBlockAsm16K + +repeat_one_match_repeat_encodeBetterBlockAsm16K: + XORL BX, BX + LEAL -4(BX)(SI*8), BX + MOVB BL, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBetterBlockAsm16K: + MOVL AX, 12(SP) + JMP search_loop_encodeBetterBlockAsm16K + +no_repeat_found_encodeBetterBlockAsm16K: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm16K + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm16K + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm16K + +candidateS_match_encodeBetterBlockAsm16K: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVWLZX (BX)(R10*2), SI + INCL AX + MOVW AX, (BX)(R10*2) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm16K + DECL AX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm16K: + MOVL 12(SP), BX + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm16K + +match_extend_back_loop_encodeBetterBlockAsm16K: + CMPL AX, BX + JBE match_extend_back_end_encodeBetterBlockAsm16K + MOVB -1(DX)(SI*1), DI + MOVB -1(DX)(AX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm16K + LEAL -1(AX), AX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm16K + JMP match_extend_back_loop_encodeBetterBlockAsm16K + +match_extend_back_end_encodeBetterBlockAsm16K: + MOVL AX, BX + SUBL 12(SP), BX + LEAQ 3(CX)(BX*1), BX + CMPQ BX, (SP) + JB match_dst_size_check_encodeBetterBlockAsm16K + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm16K: + MOVL AX, BX + ADDL $0x04, AX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), R9 + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm16K + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm16K: + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm16K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm16K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm16K + JMP matchlen_match8_match_nolit_encodeBetterBlockAsm16K + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm16K + +matchlen_match8_match_nolit_encodeBetterBlockAsm16K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm16K + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm16K + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm16K + +matchlen_match4_match_nolit_encodeBetterBlockAsm16K: + CMPL DI, $0x04 + JB 
matchlen_match2_match_nolit_encodeBetterBlockAsm16K + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm16K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm16K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm16K + JB match_nolit_end_encodeBetterBlockAsm16K + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm16K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm16K + +matchlen_match1_match_nolit_encodeBetterBlockAsm16K: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm16K + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm16K: + MOVL AX, DI + SUBL SI, DI + MOVL DI, 16(SP) + + // Check if we can combine lit+copy + MOVLQZX 12(SP), R8 + MOVL BX, SI + SUBL R8, SI + JZ match_emit_nolits_encodeBetterBlockAsm16K + CMPL DI, $0x00000040 + JL match_emit_lits_encodeBetterBlockAsm16K + CMPL SI, $0x04 + JA match_emit_lits_encodeBetterBlockAsm16K + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, DI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, DI + CMOVLLT R11, DI + LEAL -1(SI)(DI*4), DI + MOVL $0x00000003, R10 + LEAL (R10)(DI*8), DI + MOVB DI, (CX) + ADDQ $0x03, CX + MOVL R8, (CX) + ADDQ SI, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm16K + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm16K + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm16K + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm16K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm16K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm16K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm16K: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +match_emit_lits_encodeBetterBlockAsm16K: + LEAQ (DX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBetterBlockAsm16K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm16K + JB three_bytes_match_emit_encodeBetterBlockAsm16K + +three_bytes_match_emit_encodeBetterBlockAsm16K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBetterBlockAsm16K + +two_bytes_match_emit_encodeBetterBlockAsm16K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBetterBlockAsm16K + JMP memmove_long_match_emit_encodeBetterBlockAsm16K + +one_byte_match_emit_encodeBetterBlockAsm16K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ SI, $0x08 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8 + CMPQ SI, $0x10 + JBE 
emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8: + MOVQ (R8), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8through16: + MOVQ (R8), R10 + MOVQ -8(R8)(SI*1), R8 + MOVQ R10, (CX) + MOVQ R8, -8(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm16K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm16K + +memmove_midmatch_emit_encodeBetterBlockAsm16K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm16K + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_match_emit_encodeBetterBlockAsm16K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm16K + +memmove_long_match_emit_encodeBetterBlockAsm16K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R13*1), X4 + MOVOU -16(R8)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ SI, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +match_emit_nolits_encodeBetterBlockAsm16K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy + CMPL DI, $0x00000400 + JA two_byte_match_nolit_encodeBetterBlockAsm16K + CMPL R11, $0x00000013 + 
JAE emit_one_longer_match_nolit_encodeBetterBlockAsm16K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +emit_one_longer_match_nolit_encodeBetterBlockAsm16K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm16K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +emit_copy1_repeat_match_nolit_encodeBetterBlockAsm16K: + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm16K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +two_byte_match_nolit_encodeBetterBlockAsm16K: + // emitCopy2 + LEAL -64(DI), DI + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm16K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBetterBlockAsm16K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBetterBlockAsm16K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +emit_copy2_2_match_nolit_encodeBetterBlockAsm16K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +emit_copy2_1_match_nolit_encodeBetterBlockAsm16K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +emit_copy2_0_match_nolit_encodeBetterBlockAsm16K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_match_emit_repeat_encodeBetterBlockAsm16K + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm16K + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm16K + +three_bytes_match_emit_repeat_encodeBetterBlockAsm16K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm16K + +two_bytes_match_emit_repeat_encodeBetterBlockAsm16K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB 
memmove_midmatch_emit_repeat_encodeBetterBlockAsm16K + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm16K + +one_byte_match_emit_repeat_encodeBetterBlockAsm16K: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K + +memmove_midmatch_emit_repeat_encodeBetterBlockAsm16K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm16K + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm16K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K + +memmove_long_match_emit_repeat_encodeBetterBlockAsm16K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_big_loop_back + 
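+	// Tail of the long copy: advance R12 by 32 bytes per iteration until it
+	// reaches the literal length in DI, then store the saved head/tail
+	// chunks from X0-X3.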
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm16K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm16K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm16K + CMPL R11, $0x0001001e + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm16K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm16K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm16K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm16K + +repeat_one_match_nolit_repeat_encodeBetterBlockAsm16K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + +match_nolit_emitcopy_end_encodeBetterBlockAsm16K: + CMPL AX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm16K + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm16K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm16K: + MOVQ tmp+48(FP), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ $0x9e3779b1, R8 + LEAQ 1(BX), BX + LEAQ -2(AX), R9 + MOVQ (DX)(BX*1), R10 + MOVQ 1(DX)(BX*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x35, R11 + SHLQ $0x10, R12 + IMULQ DI, R12 + SHRQ $0x32, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x35, R13 + LEAQ 1(BX), R8 + LEAQ 1(R9), R14 + MOVW BX, (SI)(R10*2) + MOVW R9, (SI)(R12*2) + LEAQ 1(R9)(BX*1), R10 + SHRQ $0x01, R10 + ADDQ $0x01, BX + SUBQ $0x01, R9 + MOVW R8, 32768(SI)(R11*2) + MOVW R14, 32768(SI)(R13*2) + +index_loop_encodeBetterBlockAsm16K: + CMPQ R10, R9 + JAE search_loop_encodeBetterBlockAsm16K + MOVQ (DX)(BX*1), R8 + MOVQ (DX)(R10*1), R11 + SHLQ $0x10, R8 + IMULQ DI, R8 + SHRQ $0x32, R8 + SHLQ $0x10, R11 + IMULQ DI, R11 + SHRQ $0x32, R11 + MOVW BX, (SI)(R8*2) + MOVW R9, (SI)(R11*2) + ADDQ $0x02, BX + ADDQ $0x02, R10 + JMP index_loop_encodeBetterBlockAsm16K + +emit_remainder_encodeBetterBlockAsm16K: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm16K + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm16K: + MOVQ src_len+32(FP), AX + + // emitLiteralsDstP + MOVL 12(SP), BX + CMPL BX, AX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm16K + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (DX)(BX*1), AX + SUBL BX, SI + + // emitLiteral + LEAL -1(SI), DX + CMPL DX, $0x1d + JB one_byte_emit_remainder_encodeBetterBlockAsm16K + SUBL $0x1d, DX + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm16K + JB three_bytes_emit_remainder_encodeBetterBlockAsm16K + +three_bytes_emit_remainder_encodeBetterBlockAsm16K: + MOVB $0xf0, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, DX + JMP 
memmove_long_emit_remainder_encodeBetterBlockAsm16K + +two_bytes_emit_remainder_encodeBetterBlockAsm16K: + MOVB $0xe8, (CX) + MOVB DL, 1(CX) + ADDL $0x1d, DX + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_midemit_remainder_encodeBetterBlockAsm16K + JMP memmove_long_emit_remainder_encodeBetterBlockAsm16K + +one_byte_emit_remainder_encodeBetterBlockAsm16K: + SHLB $0x03, DL + MOVB DL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -1, min move: 1 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_3 + CMPQ BX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_4through8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_4through8: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm16K + +memmove_midemit_remainder_encodeBetterBlockAsm16K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -2, min move: 30 + CMPQ BX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm16K + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm16K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm16K: + MOVQ DX, CX + JMP 
emit_literal_done_emit_remainder_encodeBetterBlockAsm16K + +memmove_long_emit_remainder_encodeBetterBlockAsm16K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm16K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm4K(dst []byte, src []byte, tmp *[10240]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBetterBlockAsm4K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000050, DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm4K: + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm4K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm4K: + MOVQ tmp+48(FP), BX + MOVL AX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(AX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm4K + MOVQ (DX)(AX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVWLZX (BX)(R10*2), SI + MOVWLZX 8192(BX)(R11*2), R8 + MOVW AX, (BX)(R10*2) + MOVW AX, 8192(BX)(R11*2) + MOVQ (DX)(SI*1), R10 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm4K + MOVQ (DX)(R8*1), R11 + CMPQ R11, DI + MOVL AX, R12 + SUBL 16(SP), R12 + MOVQ (DX)(R12*1), R12 + MOVQ $0x000000ffffffff00, R13 + XORQ DI, R12 + TESTQ R13, R12 + JNE no_repeat_found_encodeBetterBlockAsm4K + LEAL 1(AX), BX + MOVL 12(SP), SI + MOVL BX, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm4K + +repeat_extend_back_loop_encodeBetterBlockAsm4K: + CMPL BX, SI + JBE repeat_extend_back_end_encodeBetterBlockAsm4K + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(BX*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm4K + LEAL -1(BX), BX + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm4K + +repeat_extend_back_end_encodeBetterBlockAsm4K: + MOVL BX, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBetterBlockAsm4K + MOVQ $0x00000000, ret+56(FP) + RET 
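+	// A zero return above means the projected worst-case output crossed the
+	// destination limit saved at (SP) on entry.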
+ +repeat_dst_size_check_encodeBetterBlockAsm4K: + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm4K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_repeat_emit_encodeBetterBlockAsm4K + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBetterBlockAsm4K + JB three_bytes_repeat_emit_encodeBetterBlockAsm4K + +three_bytes_repeat_emit_encodeBetterBlockAsm4K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4K + +two_bytes_repeat_emit_encodeBetterBlockAsm4K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midrepeat_emit_encodeBetterBlockAsm4K + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4K + +one_byte_repeat_emit_encodeBetterBlockAsm4K: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4K + +memmove_midrepeat_emit_encodeBetterBlockAsm4K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm4K + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm4K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4K + +memmove_long_repeat_emit_encodeBetterBlockAsm4K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + 
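+	// X0/X1 capture the first 32 bytes and X2/X3 the last 32 bytes of the
+	// literal run; they are written out after the bulk loops so short
+	// overlaps at either end are always covered.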
MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(CX)(R11*1) + MOVOA X5, -16(CX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm4K: + ADDL $0x05, AX + MOVL AX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm4K + +matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm4K: + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm4K + LEAL -16(DI), DI + LEAL 16(R10), R10 + +matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm4K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm4K + JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm4K + +matchlen_bsf_16repeat_extend_encodeBetterBlockAsm4K: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm4K + +matchlen_match8_repeat_extend_encodeBetterBlockAsm4K: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeBetterBlockAsm4K + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm4K + +matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm4K + +matchlen_match4_repeat_extend_encodeBetterBlockAsm4K: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeBetterBlockAsm4K + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm4K + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm4K: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeBetterBlockAsm4K + JB repeat_extend_forward_end_encodeBetterBlockAsm4K + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm4K + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm4K + +matchlen_match1_repeat_extend_encodeBetterBlockAsm4K: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm4K + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeBetterBlockAsm4K: + ADDL R10, AX + MOVL AX, SI + SUBL BX, SI + MOVL 16(SP), BX + + // emitRepeat + LEAL -1(SI), BX + CMPL SI, 
$0x1d + JBE repeat_one_match_repeat_encodeBetterBlockAsm4K + LEAL -30(SI), BX + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeBetterBlockAsm4K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBetterBlockAsm4K + MOVB $0xfc, (CX) + MOVL BX, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBetterBlockAsm4K + +repeat_three_match_repeat_encodeBetterBlockAsm4K: + MOVB $0xf4, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBetterBlockAsm4K + +repeat_two_match_repeat_encodeBetterBlockAsm4K: + MOVB $0xec, (CX) + MOVB BL, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBetterBlockAsm4K + +repeat_one_match_repeat_encodeBetterBlockAsm4K: + XORL BX, BX + LEAL -4(BX)(SI*8), BX + MOVB BL, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBetterBlockAsm4K: + MOVL AX, 12(SP) + JMP search_loop_encodeBetterBlockAsm4K + +no_repeat_found_encodeBetterBlockAsm4K: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm4K + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm4K + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm4K + +candidateS_match_encodeBetterBlockAsm4K: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVWLZX (BX)(R10*2), SI + INCL AX + MOVW AX, (BX)(R10*2) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm4K + DECL AX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm4K: + MOVL 12(SP), BX + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm4K + +match_extend_back_loop_encodeBetterBlockAsm4K: + CMPL AX, BX + JBE match_extend_back_end_encodeBetterBlockAsm4K + MOVB -1(DX)(SI*1), DI + MOVB -1(DX)(AX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm4K + LEAL -1(AX), AX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm4K + JMP match_extend_back_loop_encodeBetterBlockAsm4K + +match_extend_back_end_encodeBetterBlockAsm4K: + MOVL AX, BX + SUBL 12(SP), BX + LEAQ 3(CX)(BX*1), BX + CMPQ BX, (SP) + JB match_dst_size_check_encodeBetterBlockAsm4K + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm4K: + MOVL AX, BX + ADDL $0x04, AX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), R9 + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm4K + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4K: + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm4K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4K + JMP matchlen_match8_match_nolit_encodeBetterBlockAsm4K + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm4K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm4K + +matchlen_match8_match_nolit_encodeBetterBlockAsm4K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm4K + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4K + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP 
match_nolit_end_encodeBetterBlockAsm4K + +matchlen_match4_match_nolit_encodeBetterBlockAsm4K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeBetterBlockAsm4K + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm4K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm4K + JB match_nolit_end_encodeBetterBlockAsm4K + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm4K + +matchlen_match1_match_nolit_encodeBetterBlockAsm4K: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm4K + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm4K: + MOVL AX, DI + SUBL SI, DI + MOVL DI, 16(SP) + + // Check if we can combine lit+copy + MOVLQZX 12(SP), R8 + MOVL BX, SI + SUBL R8, SI + JZ match_emit_nolits_encodeBetterBlockAsm4K + CMPL DI, $0x00000040 + JL match_emit_lits_encodeBetterBlockAsm4K + CMPL SI, $0x04 + JA match_emit_lits_encodeBetterBlockAsm4K + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, DI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, DI + CMOVLLT R11, DI + LEAL -1(SI)(DI*4), DI + MOVL $0x00000003, R10 + LEAL (R10)(DI*8), DI + MOVB DI, (CX) + ADDQ $0x03, CX + MOVL R8, (CX) + ADDQ SI, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4K + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm4K + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm4K + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm4K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm4K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm4K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm4K: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +match_emit_lits_encodeBetterBlockAsm4K: + LEAQ (DX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBetterBlockAsm4K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm4K + JB three_bytes_match_emit_encodeBetterBlockAsm4K + +three_bytes_match_emit_encodeBetterBlockAsm4K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBetterBlockAsm4K + +two_bytes_match_emit_encodeBetterBlockAsm4K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBetterBlockAsm4K + JMP memmove_long_match_emit_encodeBetterBlockAsm4K + +one_byte_match_emit_encodeBetterBlockAsm4K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ SI, $0x08 + JBE 
emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8: + MOVQ (R8), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8through16: + MOVQ (R8), R10 + MOVQ -8(R8)(SI*1), R8 + MOVQ R10, (CX) + MOVQ R8, -8(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm4K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm4K + +memmove_midmatch_emit_encodeBetterBlockAsm4K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm4K + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_match_emit_encodeBetterBlockAsm4K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm4K + +memmove_long_match_emit_encodeBetterBlockAsm4K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R13*1), X4 + MOVOU -16(R8)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ SI, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +match_emit_nolits_encodeBetterBlockAsm4K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy + CMPL DI, $0x00000400 + JA 
two_byte_match_nolit_encodeBetterBlockAsm4K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBetterBlockAsm4K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +emit_one_longer_match_nolit_encodeBetterBlockAsm4K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm4K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +emit_copy1_repeat_match_nolit_encodeBetterBlockAsm4K: + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm4K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +two_byte_match_nolit_encodeBetterBlockAsm4K: + // emitCopy2 + LEAL -64(DI), DI + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm4K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBetterBlockAsm4K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBetterBlockAsm4K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +emit_copy2_2_match_nolit_encodeBetterBlockAsm4K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +emit_copy2_1_match_nolit_encodeBetterBlockAsm4K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +emit_copy2_0_match_nolit_encodeBetterBlockAsm4K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_match_emit_repeat_encodeBetterBlockAsm4K + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4K + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4K + +three_bytes_match_emit_repeat_encodeBetterBlockAsm4K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4K + +two_bytes_match_emit_repeat_encodeBetterBlockAsm4K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB 
memmove_midmatch_emit_repeat_encodeBetterBlockAsm4K + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4K + +one_byte_match_emit_repeat_encodeBetterBlockAsm4K: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K + +memmove_midmatch_emit_repeat_encodeBetterBlockAsm4K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm4K + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm4K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K + +memmove_long_match_emit_repeat_encodeBetterBlockAsm4K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_big_loop_back + 
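+	// Identical long-copy tail to the 16K variant above; the per-block-size
+	// specializations differ only in hash-table geometry and label suffixes.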
+emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm4K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm4K + CMPL R11, $0x0001001e + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm4K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm4K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4K + +repeat_one_match_nolit_repeat_encodeBetterBlockAsm4K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + +match_nolit_emitcopy_end_encodeBetterBlockAsm4K: + CMPL AX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm4K + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm4K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm4K: + MOVQ tmp+48(FP), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ $0x9e3779b1, R8 + LEAQ 1(BX), BX + LEAQ -2(AX), R9 + MOVQ (DX)(BX*1), R10 + MOVQ 1(DX)(BX*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ DI, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x36, R11 + SHLQ $0x10, R12 + IMULQ DI, R12 + SHRQ $0x34, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x36, R13 + LEAQ 1(BX), R8 + LEAQ 1(R9), R14 + MOVW BX, (SI)(R10*2) + MOVW R9, (SI)(R12*2) + LEAQ 1(R9)(BX*1), R10 + SHRQ $0x01, R10 + ADDQ $0x01, BX + SUBQ $0x01, R9 + MOVW R8, 8192(SI)(R11*2) + MOVW R14, 8192(SI)(R13*2) + +index_loop_encodeBetterBlockAsm4K: + CMPQ R10, R9 + JAE search_loop_encodeBetterBlockAsm4K + MOVQ (DX)(BX*1), R8 + MOVQ (DX)(R10*1), R11 + SHLQ $0x10, R8 + IMULQ DI, R8 + SHRQ $0x34, R8 + SHLQ $0x10, R11 + IMULQ DI, R11 + SHRQ $0x34, R11 + MOVW BX, (SI)(R8*2) + MOVW R9, (SI)(R11*2) + ADDQ $0x02, BX + ADDQ $0x02, R10 + JMP index_loop_encodeBetterBlockAsm4K + +emit_remainder_encodeBetterBlockAsm4K: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm4K + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm4K: + MOVQ src_len+32(FP), AX + + // emitLiteralsDstP + MOVL 12(SP), BX + CMPL BX, AX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4K + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (DX)(BX*1), AX + SUBL BX, SI + + // emitLiteral + LEAL -1(SI), DX + CMPL DX, $0x1d + JB one_byte_emit_remainder_encodeBetterBlockAsm4K + SUBL $0x1d, DX + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm4K + JB three_bytes_emit_remainder_encodeBetterBlockAsm4K + +three_bytes_emit_remainder_encodeBetterBlockAsm4K: + MOVB $0xf0, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4K + 
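+	// Literal headers: runs up to 29 bytes encode (len-1)<<3 in a single tag
+	// byte; 30..285 use tag 0xe8 plus one length byte; longer runs take the
+	// 0xf0 tag with a 16-bit length (three_bytes, above).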
+two_bytes_emit_remainder_encodeBetterBlockAsm4K: + MOVB $0xe8, (CX) + MOVB DL, 1(CX) + ADDL $0x1d, DX + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_midemit_remainder_encodeBetterBlockAsm4K + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4K + +one_byte_emit_remainder_encodeBetterBlockAsm4K: + SHLB $0x03, DL + MOVB DL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -1, min move: 1 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_3 + CMPQ BX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_4through8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_4through8: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4K + +memmove_midemit_remainder_encodeBetterBlockAsm4K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -2, min move: 30 + CMPQ BX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm4K + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm4K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm4K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4K + +memmove_long_emit_remainder_encodeBetterBlockAsm4K: + LEAQ (CX)(SI*1), DX + 
MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm4K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm1K(dst []byte, src []byte, tmp *[4608]byte) int +// Requires: BMI, CMOV, SSE2 +TEXT ·encodeBetterBlockAsm1K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000024, DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm1K: + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm1K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm1K: + MOVQ tmp+48(FP), BX + MOVL AX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 1(AX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm1K + MOVQ (DX)(AX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x35, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x38, R11 + MOVWLZX (BX)(R10*2), SI + MOVWLZX 4096(BX)(R11*2), R8 + MOVW AX, (BX)(R10*2) + MOVW AX, 4096(BX)(R11*2) + MOVQ (DX)(SI*1), R10 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm1K + MOVQ (DX)(R8*1), R11 + CMPQ R11, DI + MOVL AX, R12 + SUBL 16(SP), R12 + MOVQ (DX)(R12*1), R12 + MOVQ $0x000000ffffffff00, R13 + XORQ DI, R12 + TESTQ R13, R12 + JNE no_repeat_found_encodeBetterBlockAsm1K + LEAL 1(AX), BX + MOVL 12(SP), SI + MOVL BX, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm1K + +repeat_extend_back_loop_encodeBetterBlockAsm1K: + CMPL BX, SI + JBE repeat_extend_back_end_encodeBetterBlockAsm1K + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(BX*1), R9 + CMPB R8, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm1K + LEAL -1(BX), BX + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm1K + +repeat_extend_back_end_encodeBetterBlockAsm1K: + MOVL BX, SI + SUBL 12(SP), SI + LEAQ 3(CX)(SI*1), SI + CMPQ SI, (SP) + JB repeat_dst_size_check_encodeBetterBlockAsm1K + MOVQ $0x00000000, ret+56(FP) + RET + +repeat_dst_size_check_encodeBetterBlockAsm1K: + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ 
emit_literal_done_repeat_emit_encodeBetterBlockAsm1K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_repeat_emit_encodeBetterBlockAsm1K + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_repeat_emit_encodeBetterBlockAsm1K + JB three_bytes_repeat_emit_encodeBetterBlockAsm1K + +three_bytes_repeat_emit_encodeBetterBlockAsm1K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_repeat_emit_encodeBetterBlockAsm1K + +two_bytes_repeat_emit_encodeBetterBlockAsm1K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midrepeat_emit_encodeBetterBlockAsm1K + JMP memmove_long_repeat_emit_encodeBetterBlockAsm1K + +one_byte_repeat_emit_encodeBetterBlockAsm1K: + SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm1K + +memmove_midrepeat_emit_encodeBetterBlockAsm1K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm1K + +emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm1K: + MOVQ SI, CX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm1K + +memmove_long_repeat_emit_encodeBetterBlockAsm1K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + 
DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(CX)(R11*1) + MOVOA X5, -16(CX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm1K: + ADDL $0x05, AX + MOVL AX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R10, R10 + JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm1K + +matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm1K: + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K + XORQ 8(SI)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm1K + LEAL -16(DI), DI + LEAL 16(R10), R10 + +matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm1K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm1K + JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm1K + +matchlen_bsf_16repeat_extend_encodeBetterBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm1K + +matchlen_match8_repeat_extend_encodeBetterBlockAsm1K: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeBetterBlockAsm1K + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm1K + +matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm1K + +matchlen_match4_repeat_extend_encodeBetterBlockAsm1K: + CMPL DI, $0x04 + JB matchlen_match2_repeat_extend_encodeBetterBlockAsm1K + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm1K + LEAL -4(DI), DI + LEAL 4(R10), R10 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm1K: + CMPL DI, $0x01 + JE matchlen_match1_repeat_extend_encodeBetterBlockAsm1K + JB repeat_extend_forward_end_encodeBetterBlockAsm1K + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm1K + LEAL 2(R10), R10 + SUBL $0x02, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm1K + +matchlen_match1_repeat_extend_encodeBetterBlockAsm1K: + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm1K + LEAL 1(R10), R10 + +repeat_extend_forward_end_encodeBetterBlockAsm1K: + ADDL R10, AX + MOVL AX, SI + SUBL BX, SI + MOVL 16(SP), BX + + // emitRepeat + LEAL -1(SI), BX + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeBetterBlockAsm1K + LEAL -30(SI), BX + CMPL SI, $0x0000011e + JB 
repeat_two_match_repeat_encodeBetterBlockAsm1K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeBetterBlockAsm1K + MOVB $0xfc, (CX) + MOVL BX, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeBetterBlockAsm1K + +repeat_three_match_repeat_encodeBetterBlockAsm1K: + MOVB $0xf4, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeBetterBlockAsm1K + +repeat_two_match_repeat_encodeBetterBlockAsm1K: + MOVB $0xec, (CX) + MOVB BL, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeBetterBlockAsm1K + +repeat_one_match_repeat_encodeBetterBlockAsm1K: + XORL BX, BX + LEAL -4(BX)(SI*8), BX + MOVB BL, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeBetterBlockAsm1K: + MOVL AX, 12(SP) + JMP search_loop_encodeBetterBlockAsm1K + +no_repeat_found_encodeBetterBlockAsm1K: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm1K + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm1K + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm1K + +candidateS_match_encodeBetterBlockAsm1K: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x35, R10 + MOVWLZX (BX)(R10*2), SI + INCL AX + MOVW AX, (BX)(R10*2) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm1K + DECL AX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm1K: + MOVL 12(SP), BX + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm1K + +match_extend_back_loop_encodeBetterBlockAsm1K: + CMPL AX, BX + JBE match_extend_back_end_encodeBetterBlockAsm1K + MOVB -1(DX)(SI*1), DI + MOVB -1(DX)(AX*1), R8 + CMPB DI, R8 + JNE match_extend_back_end_encodeBetterBlockAsm1K + LEAL -1(AX), AX + DECL SI + JZ match_extend_back_end_encodeBetterBlockAsm1K + JMP match_extend_back_loop_encodeBetterBlockAsm1K + +match_extend_back_end_encodeBetterBlockAsm1K: + MOVL AX, BX + SUBL 12(SP), BX + LEAQ 3(CX)(BX*1), BX + CMPQ BX, (SP) + JB match_dst_size_check_encodeBetterBlockAsm1K + MOVQ $0x00000000, ret+56(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm1K: + MOVL AX, BX + ADDL $0x04, AX + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), R9 + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm1K + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm1K: + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm1K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm1K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm1K + JMP matchlen_match8_match_nolit_encodeBetterBlockAsm1K + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm1K + +matchlen_match8_match_nolit_encodeBetterBlockAsm1K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeBetterBlockAsm1K + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm1K + +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm1K + +matchlen_match4_match_nolit_encodeBetterBlockAsm1K: + CMPL DI, $0x04 + JB 
matchlen_match2_match_nolit_encodeBetterBlockAsm1K + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm1K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeBetterBlockAsm1K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeBetterBlockAsm1K + JB match_nolit_end_encodeBetterBlockAsm1K + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm1K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeBetterBlockAsm1K + +matchlen_match1_match_nolit_encodeBetterBlockAsm1K: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm1K + LEAL 1(R11), R11 + +match_nolit_end_encodeBetterBlockAsm1K: + MOVL AX, DI + SUBL SI, DI + MOVL DI, 16(SP) + + // Check if we can combine lit+copy + MOVLQZX 12(SP), R8 + MOVL BX, SI + SUBL R8, SI + JZ match_emit_nolits_encodeBetterBlockAsm1K + CMPL DI, $0x00000040 + JL match_emit_lits_encodeBetterBlockAsm1K + CMPL SI, $0x04 + JA match_emit_lits_encodeBetterBlockAsm1K + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy2WithLits + XORQ R9, R9 + SUBL $0x40, DI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, DI + CMOVLLT R11, DI + LEAL -1(SI)(DI*4), DI + MOVL $0x00000003, R10 + LEAL (R10)(DI*8), DI + MOVB DI, (CX) + ADDQ $0x03, CX + MOVL R8, (CX) + ADDQ SI, CX + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm1K + + // emitRepeat + LEAL -1(R9), SI + CMPL R9, $0x1d + JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm1K + LEAL -30(R9), SI + CMPL R9, $0x0000011e + JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm1K + CMPL R9, $0x0001001e + JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm1K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm1K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm1K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm1K: + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +match_emit_lits_encodeBetterBlockAsm1K: + LEAQ (DX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeBetterBlockAsm1K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeBetterBlockAsm1K + JB three_bytes_match_emit_encodeBetterBlockAsm1K + +three_bytes_match_emit_encodeBetterBlockAsm1K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeBetterBlockAsm1K + +two_bytes_match_emit_encodeBetterBlockAsm1K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeBetterBlockAsm1K + JMP memmove_long_match_emit_encodeBetterBlockAsm1K + +one_byte_match_emit_encodeBetterBlockAsm1K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ SI, $0x08 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8 + CMPQ SI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8through16 
+ CMPQ SI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8: + MOVQ (R8), R10 + MOVQ R10, (CX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8through16: + MOVQ (R8), R10 + MOVQ -8(R8)(SI*1), R8 + MOVQ R10, (CX) + MOVQ R8, -8(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K + +emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm1K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm1K + +memmove_midmatch_emit_encodeBetterBlockAsm1K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm1K + +emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_match_emit_encodeBetterBlockAsm1K: + MOVQ R9, CX + JMP match_emit_nolits_encodeBetterBlockAsm1K + +memmove_long_match_emit_encodeBetterBlockAsm1K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R13*1), X4 + MOVOU -16(R8)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ SI, R13 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +match_emit_nolits_encodeBetterBlockAsm1K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitCopy + CMPL DI, $0x00000400 + JA two_byte_match_nolit_encodeBetterBlockAsm1K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeBetterBlockAsm1K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 
-15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +emit_one_longer_match_nolit_encodeBetterBlockAsm1K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm1K + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +emit_copy1_repeat_match_nolit_encodeBetterBlockAsm1K: + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm1K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +two_byte_match_nolit_encodeBetterBlockAsm1K: + // emitCopy2 + LEAL -64(DI), DI + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm1K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeBetterBlockAsm1K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeBetterBlockAsm1K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +emit_copy2_2_match_nolit_encodeBetterBlockAsm1K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +emit_copy2_1_match_nolit_encodeBetterBlockAsm1K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +emit_copy2_0_match_nolit_encodeBetterBlockAsm1K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + + // emitLiteralsDstP + MOVL 12(SP), SI + CMPL SI, BX + JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI + + // emitLiteral + LEAL -1(DI), SI + CMPL SI, $0x1d + JB one_byte_match_emit_repeat_encodeBetterBlockAsm1K + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_match_emit_repeat_encodeBetterBlockAsm1K + JB three_bytes_match_emit_repeat_encodeBetterBlockAsm1K + +three_bytes_match_emit_repeat_encodeBetterBlockAsm1K: + MOVB $0xf0, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, SI + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm1K + +two_bytes_match_emit_repeat_encodeBetterBlockAsm1K: + MOVB $0xe8, (CX) + MOVB SI, 1(CX) + ADDL $0x1d, SI + ADDQ $0x02, CX + CMPL SI, $0x40 + JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm1K + JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm1K + +one_byte_match_emit_repeat_encodeBetterBlockAsm1K: + 
SHLB $0x03, SI + MOVB SI, (CX) + ADDQ $0x01, CX + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 1 + CMPQ DI, $0x08 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8: + MOVQ (R8), R9 + MOVQ R9, (CX) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (CX) + MOVQ R8, -8(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K + +emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K + +memmove_midmatch_emit_repeat_encodeBetterBlockAsm1K: + LEAQ (CX)(DI*1), SI + + // genMemMoveShort + // margin: 8, min move: 30 + CMPQ DI, $0x20 + JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(DI*1) + JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm1K + +emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + +memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm1K: + MOVQ SI, CX + JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K + +memmove_long_match_emit_repeat_encodeBetterBlockAsm1K: + LEAQ (CX)(DI*1), SI + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + 
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX + +emit_literal_done_match_emit_repeat_encodeBetterBlockAsm1K: + ADDL R11, AX + ADDL $0x04, R11 + MOVL AX, 12(SP) + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm1K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm1K + CMPL R11, $0x0001001e + JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm1K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +repeat_three_match_nolit_repeat_encodeBetterBlockAsm1K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +repeat_two_match_nolit_repeat_encodeBetterBlockAsm1K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm1K + +repeat_one_match_nolit_repeat_encodeBetterBlockAsm1K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + +match_nolit_emitcopy_end_encodeBetterBlockAsm1K: + CMPL AX, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm1K + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeBetterBlockAsm1K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm1K: + MOVQ tmp+48(FP), SI + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ $0x9e3779b1, R8 + LEAQ 1(BX), BX + LEAQ -2(AX), R9 + MOVQ (DX)(BX*1), R10 + MOVQ 1(DX)(BX*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 + SHLQ $0x10, R10 + IMULQ DI, R10 + SHRQ $0x35, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x38, R11 + SHLQ $0x10, R12 + IMULQ DI, R12 + SHRQ $0x35, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x38, R13 + LEAQ 1(BX), R8 + LEAQ 1(R9), R14 + MOVW BX, (SI)(R10*2) + MOVW R9, (SI)(R12*2) + LEAQ 1(R9)(BX*1), R10 + SHRQ $0x01, R10 + ADDQ $0x01, BX + SUBQ $0x01, R9 + MOVW R8, 4096(SI)(R11*2) + MOVW R14, 4096(SI)(R13*2) + +index_loop_encodeBetterBlockAsm1K: + CMPQ R10, R9 + JAE search_loop_encodeBetterBlockAsm1K + MOVQ (DX)(BX*1), R8 + MOVQ (DX)(R10*1), R11 + SHLQ $0x10, R8 + IMULQ DI, R8 + SHRQ $0x35, R8 + SHLQ $0x10, R11 + IMULQ DI, R11 + SHRQ $0x35, R11 + MOVW BX, (SI)(R8*2) + MOVW R9, (SI)(R11*2) + ADDQ $0x02, BX + ADDQ $0x02, R10 + JMP index_loop_encodeBetterBlockAsm1K + +emit_remainder_encodeBetterBlockAsm1K: + MOVQ src_len+32(FP), AX + SUBL 12(SP), AX + LEAQ 3(CX)(AX*1), AX + CMPQ AX, (SP) + JB emit_remainder_ok_encodeBetterBlockAsm1K + MOVQ $0x00000000, ret+56(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm1K: + MOVQ src_len+32(FP), AX + + // emitLiteralsDstP + MOVL 12(SP), BX + CMPL BX, AX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm1K + MOVL AX, SI + MOVL AX, 12(SP) + LEAQ (DX)(BX*1), AX + SUBL BX, SI + + // emitLiteral + LEAL -1(SI), DX + CMPL DX, $0x1d + JB one_byte_emit_remainder_encodeBetterBlockAsm1K + SUBL $0x1d, DX + CMPL DX, $0x00000100 + JB two_bytes_emit_remainder_encodeBetterBlockAsm1K + JB three_bytes_emit_remainder_encodeBetterBlockAsm1K + +three_bytes_emit_remainder_encodeBetterBlockAsm1K: + MOVB $0xf0, (CX) + MOVW DX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, DX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm1K + +two_bytes_emit_remainder_encodeBetterBlockAsm1K: + MOVB $0xe8, (CX) + MOVB DL, 1(CX) + ADDL $0x1d, DX + ADDQ $0x02, CX + CMPL DX, $0x40 + JB memmove_midemit_remainder_encodeBetterBlockAsm1K + JMP 
memmove_long_emit_remainder_encodeBetterBlockAsm1K + +one_byte_emit_remainder_encodeBetterBlockAsm1K: + SHLB $0x03, DL + MOVB DL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -1, min move: 1 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_3 + CMPQ BX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_4through8 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_1or2: + MOVB (AX), SI + MOVB -1(AX)(BX*1), AL + MOVB SI, (CX) + MOVB AL, -1(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_3: + MOVW (AX), SI + MOVB 2(AX), AL + MOVW SI, (CX) + MOVB AL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_4through8: + MOVL (AX), SI + MOVL -4(AX)(BX*1), AX + MOVL SI, (CX) + MOVL AX, -4(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_8through16: + MOVQ (AX), SI + MOVQ -8(AX)(BX*1), AX + MOVQ SI, (CX) + MOVQ AX, -8(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm1K + +memmove_midemit_remainder_encodeBetterBlockAsm1K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveShort + // margin: -2, min move: 30 + CMPQ BX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_17through32: + MOVOU (AX), X0 + MOVOU -16(AX)(BX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(BX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm1K + +emit_lit_memmove_mid_emit_remainder_encodeBetterBlockAsm1K_memmove_move_33through64: + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + +memmove_mid_end_copy_emit_remainder_encodeBetterBlockAsm1K: + MOVQ DX, CX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm1K + +memmove_long_emit_remainder_encodeBetterBlockAsm1K: + LEAQ (CX)(SI*1), DX + MOVL SI, BX + + // genMemMoveLong + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ 
$0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(AX)(R8*1), X4 + MOVOU -16(AX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ BX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(BX*1) + MOVOU X3, -16(CX)(BX*1) + MOVQ DX, CX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm1K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func emitLiteral(dst []byte, lit []byte) int +// Requires: SSE2 +TEXT ·emitLiteral(SB), NOSPLIT, $0-56 + MOVQ lit_len+32(FP), DX + MOVQ dst_base+0(FP), AX + MOVQ lit_base+24(FP), CX + TESTQ DX, DX + JZ emit_literal_end_standalone_skip + + // emitLiteral + MOVL DX, BX + LEAL -1(DX), SI + CMPL SI, $0x1d + JB one_byte_standalone + SUBL $0x1d, SI + CMPL SI, $0x00000100 + JB two_bytes_standalone + CMPL SI, $0x00010000 + JB three_bytes_standalone + MOVL SI, DI + SHRL $0x10, DI + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB DI, 3(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + ADDL $0x1d, SI + JMP memmove_long_standalone + +three_bytes_standalone: + MOVB $0xf0, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + ADDL $0x1d, SI + JMP memmove_long_standalone + +two_bytes_standalone: + MOVB $0xe8, (AX) + MOVB SI, 1(AX) + ADDL $0x1d, SI + ADDQ $0x02, BX + ADDQ $0x02, AX + CMPL SI, $0x40 + JB memmove_midstandalone + JMP memmove_long_standalone + +one_byte_standalone: + SHLB $0x03, SI + MOVB SI, (AX) + ADDQ $0x01, BX + ADDQ $0x01, AX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ DX, $0x03 + JB emit_lit_memmove_standalone_memmove_move_1or2 + JE emit_lit_memmove_standalone_memmove_move_3 + CMPQ DX, $0x08 + JBE emit_lit_memmove_standalone_memmove_move_4through8 + CMPQ DX, $0x10 + JBE emit_lit_memmove_standalone_memmove_move_8through16 + CMPQ DX, $0x20 + JBE emit_lit_memmove_standalone_memmove_move_17through32 + JMP emit_lit_memmove_standalone_memmove_move_33through64 + +emit_lit_memmove_standalone_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(DX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_4through8: + MOVL (CX), SI + MOVL -4(CX)(DX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_8through16: + MOVQ (CX), SI + MOVQ -8(CX)(DX*1), CX + MOVQ SI, (AX) + MOVQ CX, -8(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_standalone_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + 
MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +memmove_midstandalone: + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ DX, $0x20 + JBE emit_lit_memmove_mid_standalone_memmove_move_17through32 + JMP emit_lit_memmove_mid_standalone_memmove_move_33through64 + +emit_lit_memmove_mid_standalone_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP emit_literal_end_standalone + +emit_lit_memmove_mid_standalone_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +memmove_long_standalone: + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVQ DX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + +emit_lit_memmove_long_standalonelarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_standalonelarge_big_loop_back + +emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: + MOVOU -32(CX)(R8*1), X4 + MOVOU -16(CX)(R8*1), X5 + MOVOA X4, -32(AX)(R8*1) + MOVOA X5, -16(AX)(R8*1) + ADDQ $0x20, R8 + CMPQ DX, R8 + JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + JMP emit_literal_end_standalone + JMP emit_literal_end_standalone + +emit_literal_end_standalone_skip: + XORQ BX, BX + +emit_literal_end_standalone: + MOVQ BX, ret+48(FP) + RET + +// func emitRepeat(dst []byte, length int) int +TEXT ·emitRepeat(SB), NOSPLIT, $0-40 + XORQ DX, DX + MOVQ dst_base+0(FP), AX + MOVQ length+24(FP), CX + + // emitRepeat + LEAL -1(CX), BX + CMPL CX, $0x1d + JBE repeat_one_standalone + LEAL -30(CX), BX + CMPL CX, $0x0000011e + JB repeat_two_standalone + CMPL CX, $0x0001001e + JB repeat_three_standalone + MOVB $0xfc, (AX) + MOVL BX, 1(AX) + ADDQ $0x04, DX + ADDQ $0x04, AX + JMP gen_emit_repeat_end + +repeat_three_standalone: + MOVB $0xf4, (AX) + MOVW BX, 1(AX) + ADDQ $0x03, DX + ADDQ $0x03, AX + JMP gen_emit_repeat_end + +repeat_two_standalone: + MOVB $0xec, (AX) + MOVB BL, 1(AX) + ADDQ $0x02, DX + ADDQ $0x02, AX + JMP gen_emit_repeat_end + +repeat_one_standalone: + XORL BX, BX + LEAL -4(BX)(CX*8), BX + MOVB BL, (AX) + ADDQ $0x01, DX + ADDQ $0x01, AX + +gen_emit_repeat_end: + MOVQ DX, ret+32(FP) + RET + +// func emitCopy(dst []byte, offset int, length int) int +TEXT ·emitCopy(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + + // emitCopy + CMPL CX, $0x0001003f + JBE two_byte_offset_standalone + + // emitCopy3 + LEAL -4(DX), DX + LEAL -65536(CX), CX + SHLL $0x0b, CX + ADDL $0x07, CX + CMPL DX, $0x3c + JBE emit_copy3_0_standalone_emit3 + LEAL -60(DX), SI + CMPL DX, $0x0000013c + JB emit_copy3_1_standalone_emit3 + CMPL DX, $0x0001003c + JB emit_copy3_2_standalone_emit3 + ADDL $0x000007e0, CX + MOVL CX, (AX) + MOVL SI, 4(AX) + ADDQ $0x07, BX + ADDQ $0x07, AX + JMP gen_emit_copy_end + +emit_copy3_2_standalone_emit3: + ADDL $0x000007c0, CX + MOVL CX, (AX) + MOVW SI, 4(AX) + ADDQ $0x06, 
BX + ADDQ $0x06, AX + JMP gen_emit_copy_end + +emit_copy3_1_standalone_emit3: + ADDL $0x000007a0, CX + MOVL CX, (AX) + MOVB SI, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +emit_copy3_0_standalone_emit3: + SHLL $0x05, DX + ORL DX, CX + MOVL CX, (AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +two_byte_offset_standalone: + CMPL CX, $0x00000400 + JA two_byte_standalone + CMPL DX, $0x00000013 + JAE emit_one_longer_standalone + LEAL -1(CX), CX + SHLL $0x06, CX + LEAL -15(CX)(DX*4), CX + MOVW CX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +emit_one_longer_standalone: + CMPL DX, $0x00000112 + JAE emit_copy1_repeat_standalone + LEAL -1(CX), CX + SHLL $0x06, CX + LEAL 61(CX), CX + MOVW CX, (AX) + LEAL -18(DX), CX + MOVB CL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +emit_copy1_repeat_standalone: + LEAL -1(CX), CX + SHLL $0x06, CX + LEAL 57(CX), CX + MOVW CX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + SUBL $0x12, DX + + // emitRepeat + LEAL -1(DX), CX + CMPL DX, $0x1d + JBE repeat_one_emit_copy1_do_repeat_standalone + LEAL -30(DX), CX + CMPL DX, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_standalone + CMPL DX, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_standalone + MOVB $0xfc, (AX) + MOVL CX, 1(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_emit_copy1_do_repeat_standalone: + MOVB $0xf4, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_emit_copy1_do_repeat_standalone: + MOVB $0xec, (AX) + MOVB CL, 1(AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_one_emit_copy1_do_repeat_standalone: + XORL CX, CX + LEAL -4(CX)(DX*8), CX + MOVB CL, (AX) + ADDQ $0x01, BX + ADDQ $0x01, AX + JMP gen_emit_copy_end + +two_byte_standalone: + // emitCopy2 + LEAL -64(CX), CX + LEAL -4(DX), DX + MOVW CX, 1(AX) + CMPL DX, $0x3c + JBE emit_copy2_0_standalone_emit2 + LEAL -60(DX), CX + CMPL DX, $0x0000013c + JB emit_copy2_1_standalone_emit2 + CMPL DX, $0x0001003c + JB emit_copy2_2_standalone_emit2 + MOVB $0xfe, (AX) + MOVL CX, 3(AX) + ADDQ $0x06, BX + ADDQ $0x06, AX + JMP gen_emit_copy_end + +emit_copy2_2_standalone_emit2: + MOVB $0xfa, (AX) + MOVW CX, 3(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +emit_copy2_1_standalone_emit2: + MOVB $0xf6, (AX) + MOVB CL, 3(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +emit_copy2_0_standalone_emit2: + MOVL $0x00000002, CX + LEAL (CX)(DX*4), CX + MOVB CL, (AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end: + MOVQ BX, ret+40(FP) + RET + +// func emitCopyLits2(dst []byte, lits []byte, offset int, length int) int +// Requires: CMOV +TEXT ·emitCopyLits2(SB), NOSPLIT, $0-72 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ lits_len+32(FP), SI + MOVQ offset+48(FP), CX + MOVQ length+56(FP), DX + CMPL DX, $0x0b + + // emitCopy2WithLits + XORQ DI, DI + SUBL $0x40, CX + LEAL -11(DX), R8 + LEAL -4(DX), DX + MOVW CX, 1(AX) + CMPL DX, $0x07 + CMOVLGE R8, DI + MOVQ $0x00000007, CX + CMOVLLT DX, CX + LEAL -1(SI)(CX*4), CX + MOVL $0x00000003, DX + LEAL (DX)(CX*8), CX + MOVB CL, (AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + MOVQ lits_base+24(FP), CX + + // genMemMoveVeryShort + CMPQ SI, $0x03 + JE standalone_emitcopy2_lits_move_3 + JA standalone_emitcopy2_lits_move_4 + MOVB (CX), DL + MOVB -1(CX)(SI*1), CL + MOVB DL, (AX) + MOVB CL, -1(AX)(SI*1) + JMP standalone_emitcopy2_lits_end + +standalone_emitcopy2_lits_move_3: + MOVW (CX), DX + MOVB 2(CX), CL + MOVW DX, (AX) + MOVB CL, 
2(AX) + JMP standalone_emitcopy2_lits_end + +standalone_emitcopy2_lits_move_4: + MOVL (CX), DX + MOVL DX, (AX) + +standalone_emitcopy2_lits_end: + ADDQ SI, BX + ADDQ SI, AX + TESTL DI, DI + JZ standalone_emitcopy2_lits_done + + // emitRepeat + LEAL -1(DI), CX + CMPL DI, $0x1d + JBE repeat_one_standalone_emitcopy2_lits + LEAL -30(DI), CX + CMPL DI, $0x0000011e + JB repeat_two_standalone_emitcopy2_lits + CMPL DI, $0x0001001e + JB repeat_three_standalone_emitcopy2_lits + MOVB $0xfc, (AX) + MOVL CX, 1(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP standalone_emitcopy2_lits_done + +repeat_three_standalone_emitcopy2_lits: + MOVB $0xf4, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP standalone_emitcopy2_lits_done + +repeat_two_standalone_emitcopy2_lits: + MOVB $0xec, (AX) + MOVB CL, 1(AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP standalone_emitcopy2_lits_done + +repeat_one_standalone_emitcopy2_lits: + XORL CX, CX + LEAL -4(CX)(DI*8), CX + MOVB CL, (AX) + ADDQ $0x01, BX + ADDQ $0x01, AX + +standalone_emitcopy2_lits_done: + MOVQ BX, ret+64(FP) + RET + +// func emitCopyLits3(dst []byte, lits []byte, offset int, length int) int +TEXT ·emitCopyLits3(SB), NOSPLIT, $0-72 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ lits_len+32(FP), SI + MOVQ offset+48(FP), CX + MOVQ length+56(FP), DX + + // emitCopy3 + LEAL -4(DX), DX + LEAL -65536(CX), CX + SHLL $0x0b, CX + LEAL 7(CX)(SI*8), CX + CMPL DX, $0x3c + JBE emit_copy3_0_standalone_lits + LEAL -60(DX), DI + CMPL DX, $0x0000013c + JB emit_copy3_1_standalone_lits + CMPL DX, $0x0001003c + JB emit_copy3_2_standalone_lits + ADDL $0x000007e0, CX + MOVL CX, (AX) + MOVL DI, 4(AX) + ADDQ $0x07, BX + ADDQ $0x07, AX + JMP gen_emit_copy_lits_copylits + +emit_copy3_2_standalone_lits: + ADDL $0x000007c0, CX + MOVL CX, (AX) + MOVW DI, 4(AX) + ADDQ $0x06, BX + ADDQ $0x06, AX + JMP gen_emit_copy_lits_copylits + +emit_copy3_1_standalone_lits: + ADDL $0x000007a0, CX + MOVL CX, (AX) + MOVB DI, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_lits_copylits + +emit_copy3_0_standalone_lits: + SHLL $0x05, DX + ORL DX, CX + MOVL CX, (AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + +gen_emit_copy_lits_copylits: + MOVQ lits_base+24(FP), CX + + // genMemMoveVeryShort + CMPQ SI, $0x03 + JE standalone_emitcopy3_lits_move_3 + MOVB (CX), DL + MOVB -1(CX)(SI*1), CL + MOVB DL, (AX) + MOVB CL, -1(AX)(SI*1) + JMP standalone_emitcopy3_lits_end + +standalone_emitcopy3_lits_move_3: + MOVW (CX), DX + MOVB 2(CX), CL + MOVW DX, (AX) + MOVB CL, 2(AX) + +standalone_emitcopy3_lits_end: + ADDQ SI, BX + MOVQ BX, ret+64(FP) + RET + +// func matchLen(a []byte, b []byte) int +// Requires: BMI +TEXT ·matchLen(SB), NOSPLIT, $0-56 + MOVQ a_base+0(FP), AX + MOVQ b_base+24(FP), CX + MOVQ a_len+8(FP), DX + + // matchLen + XORL SI, SI + JMP matchlen_loop_16_entry_standalone + +matchlen_loopback_16_standalone: + MOVQ (AX)(SI*1), BX + MOVQ 8(AX)(SI*1), DI + XORQ (CX)(SI*1), BX + JNZ matchlen_bsf_8_standalone + XORQ 8(CX)(SI*1), DI + JNZ matchlen_bsf_16standalone + LEAL -16(DX), DX + LEAL 16(SI), SI + +matchlen_loop_16_entry_standalone: + CMPL DX, $0x10 + JAE matchlen_loopback_16_standalone + JMP matchlen_match8_standalone + +matchlen_bsf_16standalone: +#ifdef GOAMD64_v3 + TZCNTQ DI, DI + +#else + BSFQ DI, DI + +#endif + SARQ $0x03, DI + LEAL 8(SI)(DI*1), SI + JMP gen_match_len_end + +matchlen_match8_standalone: + CMPL DX, $0x08 + JB matchlen_match4_standalone + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX + JNZ matchlen_bsf_8_standalone + LEAL -8(DX), DX + LEAL 8(SI), SI + JMP matchlen_match4_standalone + 
+matchlen_bsf_8_standalone: +#ifdef GOAMD64_v3 + TZCNTQ BX, BX + +#else + BSFQ BX, BX + +#endif + SARQ $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end + +matchlen_match4_standalone: + CMPL DX, $0x04 + JB matchlen_match2_standalone + MOVL (AX)(SI*1), BX + CMPL (CX)(SI*1), BX + JNE matchlen_match2_standalone + LEAL -4(DX), DX + LEAL 4(SI), SI + +matchlen_match2_standalone: + CMPL DX, $0x01 + JE matchlen_match1_standalone + JB gen_match_len_end + MOVW (AX)(SI*1), BX + CMPW (CX)(SI*1), BX + JNE matchlen_match1_standalone + LEAL 2(SI), SI + SUBL $0x02, DX + JZ gen_match_len_end + +matchlen_match1_standalone: + MOVB (AX)(SI*1), BL + CMPB (CX)(SI*1), BL + JNE gen_match_len_end + LEAL 1(SI), SI + +gen_match_len_end: + MOVQ SI, ret+48(FP) + RET + +// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: CMOV, SSE2 +TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $8-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -12(AX)(CX*1), CX + MOVL $0x00000001, (SP) + +lz4_mz_loop: + CMPQ DX, BX + JAE lz4_mz_corrupt + CMPQ AX, CX + JAE lz4_mz_dstfull + MOVBQZX (DX), DI + MOVQ DI, R8 + MOVQ DI, R9 + ANDQ $0x0f, R9 + XORQ R10, R10 + SHRQ $0x04, R8 + CMPQ DI, $0x50 + CMOVQLT R8, R10 + JLT lz4_mz_ll_end + CMPQ DI, $0xf0 + JB lz4_mz_ll_end + +lz4_mz_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4_mz_corrupt + MOVBQZX (DX), DI + ADDQ DI, R8 + CMPQ DI, $0xff + JEQ lz4_mz_ll_loop + +lz4_mz_ll_end: + LEAQ (DX)(R8*1), DI + ADDQ $0x04, R9 + CMPQ DI, BX + JAE lz4_mz_corrupt + INCQ DX + INCQ DI + TESTQ R8, R8 + JZ lz4_mz_lits_done + TESTQ R10, R10 + JNZ lz4_mz_lits_done + LEAQ (AX)(R8*1), R11 + CMPQ R11, CX + JAE lz4_mz_dstfull + + // emitLiteral + LEAL -1(R8), R11 + CMPL R11, $0x1d + JB one_byte_lz4_mz + SUBL $0x1d, R11 + CMPL R11, $0x00000100 + JB two_bytes_lz4_mz + CMPL R11, $0x00010000 + JB three_bytes_lz4_mz + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (AX) + MOVW R11, 1(AX) + MOVB R12, 3(AX) + ADDQ $0x04, AX + ADDL $0x1d, R11 + JMP memmove_long_lz4_mz + +three_bytes_lz4_mz: + MOVB $0xf0, (AX) + MOVW R11, 1(AX) + ADDQ $0x03, AX + ADDL $0x1d, R11 + JMP memmove_long_lz4_mz + +two_bytes_lz4_mz: + MOVB $0xe8, (AX) + MOVB R11, 1(AX) + ADDL $0x1d, R11 + ADDQ $0x02, AX + CMPL R11, $0x40 + JB memmove_midlz4_mz + JMP memmove_long_lz4_mz + +one_byte_lz4_mz: + SHLB $0x03, R11 + MOVB R11, (AX) + ADDQ $0x01, AX + LEAQ (AX)(R8*1), R11 + MOVL R8, R12 + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ R12, $0x03 + JB emit_lit_memmove_lz4_mz_memmove_move_1or2 + JE emit_lit_memmove_lz4_mz_memmove_move_3 + CMPQ R12, $0x08 + JBE emit_lit_memmove_lz4_mz_memmove_move_4through8 + CMPQ R12, $0x10 + JBE emit_lit_memmove_lz4_mz_memmove_move_8through16 + CMPQ R12, $0x20 + JBE emit_lit_memmove_lz4_mz_memmove_move_17through32 + JMP emit_lit_memmove_lz4_mz_memmove_move_33through64 + +emit_lit_memmove_lz4_mz_memmove_move_1or2: + MOVB (DX), R13 + MOVB -1(DX)(R12*1), R14 + MOVB R13, (AX) + MOVB R14, -1(AX)(R12*1) + JMP memmove_end_copy_lz4_mz + +emit_lit_memmove_lz4_mz_memmove_move_3: + MOVW (DX), R13 + MOVB 2(DX), R14 + MOVW R13, (AX) + MOVB R14, 2(AX) + JMP memmove_end_copy_lz4_mz + +emit_lit_memmove_lz4_mz_memmove_move_4through8: + MOVL (DX), R13 + MOVL -4(DX)(R12*1), R14 + MOVL R13, (AX) + MOVL R14, -4(AX)(R12*1) + JMP memmove_end_copy_lz4_mz + +emit_lit_memmove_lz4_mz_memmove_move_8through16: + MOVQ (DX), R13 + MOVQ -8(DX)(R12*1), R14 + MOVQ R13, (AX) + MOVQ R14, -8(AX)(R12*1) + JMP memmove_end_copy_lz4_mz + 
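+	// 17..32 bytes: two (possibly overlapping) 16-byte SSE loads and
+	// stores cover the whole copy without a length-dependent loop.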
+emit_lit_memmove_lz4_mz_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R12*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R12*1) + JMP memmove_end_copy_lz4_mz + +emit_lit_memmove_lz4_mz_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R12*1), X2 + MOVOU -16(DX)(R12*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R12*1) + MOVOU X3, -16(AX)(R12*1) + +memmove_end_copy_lz4_mz: + MOVQ R11, AX + JMP lz4_mz_lits_emit_done + +memmove_midlz4_mz: + LEAQ (AX)(R8*1), R11 + MOVL R8, R12 + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ R12, $0x20 + JBE emit_lit_memmove_mid_lz4_mz_memmove_move_17through32 + JMP emit_lit_memmove_mid_lz4_mz_memmove_move_33through64 + +emit_lit_memmove_mid_lz4_mz_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R12*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R12*1) + JMP memmove_mid_end_copy_lz4_mz + +emit_lit_memmove_mid_lz4_mz_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R12*1), X2 + MOVOU -16(DX)(R12*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R12*1) + MOVOU X3, -16(AX)(R12*1) + +memmove_mid_end_copy_lz4_mz: + MOVQ R11, AX + JMP lz4_mz_lits_emit_done + +memmove_long_lz4_mz: + LEAQ (AX)(R8*1), R11 + MOVL R8, R12 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R12*1), X2 + MOVOU -16(DX)(R12*1), X3 + MOVQ R12, R14 + SHRQ $0x05, R14 + MOVQ AX, R13 + ANDL $0x0000001f, R13 + MOVQ $0x00000040, R15 + SUBQ R13, R15 + DECQ R14 + JA emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32 + LEAQ -32(DX)(R15*1), R13 + LEAQ -32(AX)(R15*1), BP + +emit_lit_memmove_long_lz4_mzlarge_big_loop_back: + MOVOU (R13), X4 + MOVOU 16(R13), X5 + MOVOA X4, (BP) + MOVOA X5, 16(BP) + ADDQ $0x20, BP + ADDQ $0x20, R13 + ADDQ $0x20, R15 + DECQ R14 + JNA emit_lit_memmove_long_lz4_mzlarge_big_loop_back + +emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32: + MOVOU -32(DX)(R15*1), X4 + MOVOU -16(DX)(R15*1), X5 + MOVOA X4, -32(AX)(R15*1) + MOVOA X5, -16(AX)(R15*1) + ADDQ $0x20, R15 + CMPQ R12, R15 + JAE emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R12*1) + MOVOU X3, -16(AX)(R12*1) + MOVQ R11, AX + +lz4_mz_lits_emit_done: +lz4_mz_lits_done: + ADDQ R8, SI + MOVQ DI, R8 + MOVQ DX, DI + MOVQ R8, DX + CMPQ DX, BX + JNE lz4_mz_match + CMPQ R9, $0x04 + JNE lz4_mz_corrupt + TESTQ R10, R10 + JNZ lz4_mz_emit_final + JMP lz4_mz_done + +lz4_mz_match: + ADDQ $0x02, DX + CMPQ DX, BX + JAE lz4_mz_corrupt + MOVWQZX -2(DX), R8 + TESTQ R8, R8 + JZ lz4_mz_corrupt + CMPQ R8, SI + JA lz4_mz_corrupt + CMPQ R9, $0x13 + JNE lz4_mz_ml_done + +lz4_mz_ml_loop: + MOVBQZX (DX), R11 + INCQ DX + ADDQ R11, R9 + CMPQ DX, BX + JAE lz4_mz_corrupt + CMPQ R11, $0xff + JEQ lz4_mz_ml_loop + +lz4_mz_ml_done: + ADDQ R9, SI + TESTQ R10, R10 + JNZ lz4_mz_dofuse + CMPQ (SP), R8 + JNE lz4_mz_docopy + + // emitRepeat + LEAL -1(R9), DI + CMPL R9, $0x1d + JBE repeat_one_lz4_mz + LEAL -30(R9), DI + CMPL R9, $0x0000011e + JB repeat_two_lz4_mz + CMPL R9, $0x0001001e + JB repeat_three_lz4_mz + MOVB $0xfc, (AX) + MOVL DI, 1(AX) + ADDQ $0x04, AX + JMP lz4_mz_loop + +repeat_three_lz4_mz: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP lz4_mz_loop + +repeat_two_lz4_mz: + MOVB $0xec, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + JMP lz4_mz_loop + +repeat_one_lz4_mz: + XORL DI, DI + LEAL -4(DI)(R9*8), DI + MOVB DI, (AX) + ADDQ $0x01, AX + JMP lz4_mz_loop + +lz4_mz_dofuse: + MOVQ R8, (SP) + CMPQ R8, $0x40 + JB lz4_mz_doemitcopy + + // 
emitCopy2WithLits + XORQ R11, R11 + SUBL $0x40, R8 + LEAL -11(R9), R12 + LEAL -4(R9), R9 + MOVW R8, 1(AX) + CMPL R9, $0x07 + CMOVLGE R12, R11 + MOVQ $0x00000007, R8 + CMOVLLT R9, R8 + LEAL -1(R10)(R8*4), R8 + MOVL $0x00000003, R9 + LEAL (R9)(R8*8), R8 + MOVB R8, (AX) + ADDQ $0x03, AX + MOVL (DI), DI + MOVL DI, (AX) + ADDQ R10, AX + TESTL R11, R11 + JZ lz4_mz_loop + + // emitRepeat + LEAL -1(R11), DI + CMPL R11, $0x1d + JBE repeat_one_fused_emitrep_lz4_mz_ + LEAL -30(R11), DI + CMPL R11, $0x0000011e + JB repeat_two_fused_emitrep_lz4_mz_ + CMPL R11, $0x0001001e + JB repeat_three_fused_emitrep_lz4_mz_ + MOVB $0xfc, (AX) + MOVL DI, 1(AX) + ADDQ $0x04, AX + JMP lz4_mz_loop + +repeat_three_fused_emitrep_lz4_mz_: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP lz4_mz_loop + +repeat_two_fused_emitrep_lz4_mz_: + MOVB $0xec, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + JMP lz4_mz_loop + +repeat_one_fused_emitrep_lz4_mz_: + XORL DI, DI + LEAL -4(DI)(R11*8), DI + MOVB DI, (AX) + ADDQ $0x01, AX + JMP lz4_mz_loop + +lz4_mz_doemitcopy: + // emitLiteral + LEAL -1(R10), R11 + CMPL R11, $0x1d + JB one_byte_lz4_mz_emitcopy + SUBL $0x1d, R11 + CMPL R11, $0x00000100 + JB two_bytes_lz4_mz_emitcopy + CMPL R11, $0x00010000 + JB three_bytes_lz4_mz_emitcopy + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (AX) + MOVW R11, 1(AX) + MOVB R12, 3(AX) + ADDQ $0x04, AX + ADDL $0x1d, R11 + JMP memmove_long_lz4_mz_emitcopy + +three_bytes_lz4_mz_emitcopy: + MOVB $0xf0, (AX) + MOVW R11, 1(AX) + ADDQ $0x03, AX + ADDL $0x1d, R11 + JMP memmove_long_lz4_mz_emitcopy + +two_bytes_lz4_mz_emitcopy: + MOVB $0xe8, (AX) + MOVB R11, 1(AX) + ADDL $0x1d, R11 + ADDQ $0x02, AX + CMPL R11, $0x40 + JB memmove_midlz4_mz_emitcopy + JMP memmove_long_lz4_mz_emitcopy + +one_byte_lz4_mz_emitcopy: + SHLB $0x03, R11 + MOVB R11, (AX) + ADDQ $0x01, AX + LEAQ (AX)(R10*1), R11 + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ R10, $0x03 + JB emit_lit_memmove_lz4_mz_emitcopy_memmove_move_1or2 + JE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_3 + CMPQ R10, $0x08 + JBE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_4through8 + CMPQ R10, $0x10 + JBE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_8through16 + CMPQ R10, $0x20 + JBE emit_lit_memmove_lz4_mz_emitcopy_memmove_move_17through32 + JMP emit_lit_memmove_lz4_mz_emitcopy_memmove_move_33through64 + +emit_lit_memmove_lz4_mz_emitcopy_memmove_move_1or2: + MOVB (DI), R12 + MOVB -1(DI)(R10*1), DI + MOVB R12, (AX) + MOVB DI, -1(AX)(R10*1) + JMP memmove_end_copy_lz4_mz_emitcopy + +emit_lit_memmove_lz4_mz_emitcopy_memmove_move_3: + MOVW (DI), R12 + MOVB 2(DI), DI + MOVW R12, (AX) + MOVB DI, 2(AX) + JMP memmove_end_copy_lz4_mz_emitcopy + +emit_lit_memmove_lz4_mz_emitcopy_memmove_move_4through8: + MOVL (DI), R12 + MOVL -4(DI)(R10*1), DI + MOVL R12, (AX) + MOVL DI, -4(AX)(R10*1) + JMP memmove_end_copy_lz4_mz_emitcopy + +emit_lit_memmove_lz4_mz_emitcopy_memmove_move_8through16: + MOVQ (DI), R12 + MOVQ -8(DI)(R10*1), DI + MOVQ R12, (AX) + MOVQ DI, -8(AX)(R10*1) + JMP memmove_end_copy_lz4_mz_emitcopy + +emit_lit_memmove_lz4_mz_emitcopy_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R10*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R10*1) + JMP memmove_end_copy_lz4_mz_emitcopy + +emit_lit_memmove_lz4_mz_emitcopy_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R10*1), X2 + MOVOU -16(DI)(R10*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R10*1) + MOVOU X3, -16(AX)(R10*1) + +memmove_end_copy_lz4_mz_emitcopy: + MOVQ R11, AX + JMP lz4_mz__emit_done + 
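+	// Mid-size literal path: the length is known to be at least 30 here
+	// (see "min move" below), so the 1-16 byte cases are omitted.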
+memmove_midlz4_mz_emitcopy: + LEAQ (AX)(R10*1), R11 + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ R10, $0x20 + JBE emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_17through32 + JMP emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_33through64 + +emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R10*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R10*1) + JMP memmove_mid_end_copy_lz4_mz_emitcopy + +emit_lit_memmove_mid_lz4_mz_emitcopy_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R10*1), X2 + MOVOU -16(DI)(R10*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R10*1) + MOVOU X3, -16(AX)(R10*1) + +memmove_mid_end_copy_lz4_mz_emitcopy: + MOVQ R11, AX + JMP lz4_mz__emit_done + +memmove_long_lz4_mz_emitcopy: + LEAQ (AX)(R10*1), R11 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R10*1), X2 + MOVOU -16(DI)(R10*1), X3 + MOVQ R10, R13 + SHRQ $0x05, R13 + MOVQ AX, R12 + ANDL $0x0000001f, R12 + MOVQ $0x00000040, R14 + SUBQ R12, R14 + DECQ R13 + JA emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32 + LEAQ -32(DI)(R14*1), R12 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_lz4_mz_emitcopylarge_big_loop_back: + MOVOU (R12), X4 + MOVOU 16(R12), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R12 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_lz4_mz_emitcopylarge_big_loop_back + +emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32: + MOVOU -32(DI)(R14*1), X4 + MOVOU -16(DI)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R10, R14 + JAE emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R10*1) + MOVOU X3, -16(AX)(R10*1) + MOVQ R11, AX + +lz4_mz__emit_done: + // emitCopy + CMPL R8, $0x00000400 + JA two_byte_lz4_mz__lz4_mz_short_ + CMPL R9, $0x00000013 + JAE emit_one_longer_lz4_mz__lz4_mz_short_ + LEAL -1(R8), DI + SHLL $0x06, DI + LEAL -15(DI)(R9*4), DI + MOVW DI, (AX) + ADDQ $0x02, AX + JMP lz4_mz_loop + +emit_one_longer_lz4_mz__lz4_mz_short_: + CMPL R9, $0x00000112 + JAE emit_copy1_repeat_lz4_mz__lz4_mz_short_ + LEAL -1(R8), DI + SHLL $0x06, DI + LEAL 61(DI), DI + MOVW DI, (AX) + LEAL -18(R9), DI + MOVB DI, 2(AX) + ADDQ $0x03, AX + JMP lz4_mz_loop + +emit_copy1_repeat_lz4_mz__lz4_mz_short_: + LEAL -1(R8), DI + SHLL $0x06, DI + LEAL 57(DI), DI + MOVW DI, (AX) + ADDQ $0x02, AX + SUBL $0x12, R9 + + // emitRepeat + LEAL -1(R9), DI + CMPL R9, $0x1d + JBE repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_ + LEAL -30(R9), DI + CMPL R9, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_ + CMPL R9, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_ + MOVB $0xfc, (AX) + MOVL DI, 1(AX) + ADDQ $0x04, AX + JMP lz4_mz_loop + +repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP lz4_mz_loop + +repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_: + MOVB $0xec, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + JMP lz4_mz_loop + +repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz_short_: + XORL DI, DI + LEAL -4(DI)(R9*8), DI + MOVB DI, (AX) + ADDQ $0x01, AX + JMP lz4_mz_loop + +two_byte_lz4_mz__lz4_mz_short_: + // emitCopy2 + LEAL -64(R8), R8 + LEAL -4(R9), R9 + MOVW R8, 1(AX) + CMPL R9, $0x3c + JBE emit_copy2_0_lz4_mz__lz4_mz_short__emit2 + LEAL -60(R9), DI + CMPL R9, $0x0000013c + JB emit_copy2_1_lz4_mz__lz4_mz_short__emit2 + CMPL 
R9, $0x0001003c + JB emit_copy2_2_lz4_mz__lz4_mz_short__emit2 + MOVB $0xfe, (AX) + MOVL DI, 3(AX) + ADDQ $0x06, AX + JMP lz4_mz_loop + +emit_copy2_2_lz4_mz__lz4_mz_short__emit2: + MOVB $0xfa, (AX) + MOVW DI, 3(AX) + ADDQ $0x05, AX + JMP lz4_mz_loop + +emit_copy2_1_lz4_mz__lz4_mz_short__emit2: + MOVB $0xf6, (AX) + MOVB DI, 3(AX) + ADDQ $0x04, AX + JMP lz4_mz_loop + +emit_copy2_0_lz4_mz__lz4_mz_short__emit2: + MOVL $0x00000002, DI + LEAL (DI)(R9*4), DI + MOVB DI, (AX) + ADDQ $0x03, AX + JMP lz4_mz_loop + +lz4_mz_docopy: + MOVQ R8, (SP) + + // emitCopy + CMPL R8, $0x00000400 + JA two_byte_lz4_mz__lz4_mz + CMPL R9, $0x00000013 + JAE emit_one_longer_lz4_mz__lz4_mz + LEAL -1(R8), DI + SHLL $0x06, DI + LEAL -15(DI)(R9*4), DI + MOVW DI, (AX) + ADDQ $0x02, AX + JMP lz4_mz_loop + +emit_one_longer_lz4_mz__lz4_mz: + CMPL R9, $0x00000112 + JAE emit_copy1_repeat_lz4_mz__lz4_mz + LEAL -1(R8), DI + SHLL $0x06, DI + LEAL 61(DI), DI + MOVW DI, (AX) + LEAL -18(R9), DI + MOVB DI, 2(AX) + ADDQ $0x03, AX + JMP lz4_mz_loop + +emit_copy1_repeat_lz4_mz__lz4_mz: + LEAL -1(R8), DI + SHLL $0x06, DI + LEAL 57(DI), DI + MOVW DI, (AX) + ADDQ $0x02, AX + SUBL $0x12, R9 + + // emitRepeat + LEAL -1(R9), DI + CMPL R9, $0x1d + JBE repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz + LEAL -30(R9), DI + CMPL R9, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz + CMPL R9, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz + MOVB $0xfc, (AX) + MOVL DI, 1(AX) + ADDQ $0x04, AX + JMP lz4_mz_loop + +repeat_three_emit_copy1_do_repeat_lz4_mz__lz4_mz: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP lz4_mz_loop + +repeat_two_emit_copy1_do_repeat_lz4_mz__lz4_mz: + MOVB $0xec, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + JMP lz4_mz_loop + +repeat_one_emit_copy1_do_repeat_lz4_mz__lz4_mz: + XORL DI, DI + LEAL -4(DI)(R9*8), DI + MOVB DI, (AX) + ADDQ $0x01, AX + JMP lz4_mz_loop + +two_byte_lz4_mz__lz4_mz: + // emitCopy2 + LEAL -64(R8), R8 + LEAL -4(R9), R9 + MOVW R8, 1(AX) + CMPL R9, $0x3c + JBE emit_copy2_0_lz4_mz__lz4_mz_emit2 + LEAL -60(R9), DI + CMPL R9, $0x0000013c + JB emit_copy2_1_lz4_mz__lz4_mz_emit2 + CMPL R9, $0x0001003c + JB emit_copy2_2_lz4_mz__lz4_mz_emit2 + MOVB $0xfe, (AX) + MOVL DI, 3(AX) + ADDQ $0x06, AX + JMP lz4_mz_loop + +emit_copy2_2_lz4_mz__lz4_mz_emit2: + MOVB $0xfa, (AX) + MOVW DI, 3(AX) + ADDQ $0x05, AX + JMP lz4_mz_loop + +emit_copy2_1_lz4_mz__lz4_mz_emit2: + MOVB $0xf6, (AX) + MOVB DI, 3(AX) + ADDQ $0x04, AX + JMP lz4_mz_loop + +emit_copy2_0_lz4_mz__lz4_mz_emit2: + MOVL $0x00000002, DI + LEAL (DI)(R9*4), DI + MOVB DI, (AX) + ADDQ $0x03, AX + JMP lz4_mz_loop + +lz4_mz_emit_final: + // emitLiteral + LEAL -1(R10), CX + CMPL CX, $0x1d + JB one_byte_lz4_mz_emit_final + SUBL $0x1d, CX + CMPL CX, $0x00000100 + JB two_bytes_lz4_mz_emit_final + CMPL CX, $0x00010000 + JB three_bytes_lz4_mz_emit_final + MOVL CX, DX + SHRL $0x10, DX + MOVB $0xf8, (AX) + MOVW CX, 1(AX) + MOVB DL, 3(AX) + ADDQ $0x04, AX + ADDL $0x1d, CX + JMP memmove_long_lz4_mz_emit_final + +three_bytes_lz4_mz_emit_final: + MOVB $0xf0, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, AX + ADDL $0x1d, CX + JMP memmove_long_lz4_mz_emit_final + +two_bytes_lz4_mz_emit_final: + MOVB $0xe8, (AX) + MOVB CL, 1(AX) + ADDL $0x1d, CX + ADDQ $0x02, AX + CMPL CX, $0x40 + JB memmove_midlz4_mz_emit_final + JMP memmove_long_lz4_mz_emit_final + +one_byte_lz4_mz_emit_final: + SHLB $0x03, CL + MOVB CL, (AX) + ADDQ $0x01, AX + LEAQ (AX)(R10*1), CX + MOVL R10, DX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ DX, $0x03 + JB 
emit_lit_memmove_lz4_mz_emit_final_memmove_move_1or2 + JE emit_lit_memmove_lz4_mz_emit_final_memmove_move_3 + CMPQ DX, $0x08 + JBE emit_lit_memmove_lz4_mz_emit_final_memmove_move_4through8 + CMPQ DX, $0x10 + JBE emit_lit_memmove_lz4_mz_emit_final_memmove_move_8through16 + CMPQ DX, $0x20 + JBE emit_lit_memmove_lz4_mz_emit_final_memmove_move_17through32 + JMP emit_lit_memmove_lz4_mz_emit_final_memmove_move_33through64 + +emit_lit_memmove_lz4_mz_emit_final_memmove_move_1or2: + MOVB (DI), BL + MOVB -1(DI)(DX*1), DI + MOVB BL, (AX) + MOVB DI, -1(AX)(DX*1) + JMP memmove_end_copy_lz4_mz_emit_final + +emit_lit_memmove_lz4_mz_emit_final_memmove_move_3: + MOVW (DI), BX + MOVB 2(DI), DI + MOVW BX, (AX) + MOVB DI, 2(AX) + JMP memmove_end_copy_lz4_mz_emit_final + +emit_lit_memmove_lz4_mz_emit_final_memmove_move_4through8: + MOVL (DI), BX + MOVL -4(DI)(DX*1), DI + MOVL BX, (AX) + MOVL DI, -4(AX)(DX*1) + JMP memmove_end_copy_lz4_mz_emit_final + +emit_lit_memmove_lz4_mz_emit_final_memmove_move_8through16: + MOVQ (DI), BX + MOVQ -8(DI)(DX*1), DI + MOVQ BX, (AX) + MOVQ DI, -8(AX)(DX*1) + JMP memmove_end_copy_lz4_mz_emit_final + +emit_lit_memmove_lz4_mz_emit_final_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP memmove_end_copy_lz4_mz_emit_final + +emit_lit_memmove_lz4_mz_emit_final_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(DX*1), X2 + MOVOU -16(DI)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + +memmove_end_copy_lz4_mz_emit_final: + MOVQ CX, AX + JMP lz4_mz_done + +memmove_midlz4_mz_emit_final: + LEAQ (AX)(R10*1), CX + MOVL R10, DX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ DX, $0x20 + JBE emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_17through32 + JMP emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_33through64 + +emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(DX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DX*1) + JMP memmove_mid_end_copy_lz4_mz_emit_final + +emit_lit_memmove_mid_lz4_mz_emit_final_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(DX*1), X2 + MOVOU -16(DI)(DX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + +memmove_mid_end_copy_lz4_mz_emit_final: + MOVQ CX, AX + JMP lz4_mz_done + +memmove_long_lz4_mz_emit_final: + LEAQ (AX)(R10*1), CX + MOVL R10, DX + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(DX*1), X2 + MOVOU -16(DI)(DX*1), X3 + MOVQ DX, R8 + SHRQ $0x05, R8 + MOVQ AX, BX + ANDL $0x0000001f, BX + MOVQ $0x00000040, R9 + SUBQ BX, R9 + DECQ R8 + JA emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32 + LEAQ -32(DI)(R9*1), BX + LEAQ -32(AX)(R9*1), R10 + +emit_lit_memmove_long_lz4_mz_emit_finallarge_big_loop_back: + MOVOU (BX), X4 + MOVOU 16(BX), X5 + MOVOA X4, (R10) + MOVOA X5, 16(R10) + ADDQ $0x20, R10 + ADDQ $0x20, BX + ADDQ $0x20, R9 + DECQ R8 + JNA emit_lit_memmove_long_lz4_mz_emit_finallarge_big_loop_back + +emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32: + MOVOU -32(DI)(R9*1), X4 + MOVOU -16(DI)(R9*1), X5 + MOVOA X4, -32(AX)(R9*1) + MOVOA X5, -16(AX)(R9*1) + ADDQ $0x20, R9 + CMPQ DX, R9 + JAE emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) + MOVQ CX, AX + +lz4_mz_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, 
uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4_mz_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4_mz_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +// func decodeBlockAsm(dst []byte, src []byte) int +// Requires: CMOV, SSE2 +TEXT ·decodeBlockAsm(SB), $8-56 + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + MOVQ AX, SI + XORQ DI, DI + MOVQ DX, R8 + MOVQ $0x00000001, R9 + LEAQ (AX)(CX*1), AX + LEAQ (DX)(BX*1), CX + LEAQ -20(CX), DX + LEAQ -20(AX), BX + CMPQ R8, DX + JAE decodeBlockAsm_fast_end_copy + MOVBQZX (R8), R10 + MOVQ R10, R11 + SHRQ $0x02, R11 + +decodeBlockAsm_fast_loop_nofetch: + CMPQ SI, BX + JAE decodeBlockAsm_fast_end_copy + ANDQ $0x03, R10 + JNZ decodeBlockAsm_fast_copy + +decodeBlockAsm_fast_lits: + MOVL R11, R12 + SHRL $0x01, R12 + CMPL R12, $0x1d + JB decodeBlockAsm_fast_lit_0 + JEQ decodeBlockAsm_fast_lit_1 + CMPL R12, $0x1e + JEQ decodeBlockAsm_fast_lit_2 + JMP decodeBlockAsm_fast_lit_3 + +decodeBlockAsm_fast_lit_0: + INCQ R8 + INCL R12 + LEAQ (SI)(R12*1), R10 + CMPQ R10, AX + JA corrupt + BTL $0x00, R11 + JC decodeBlockAsm_fast_copy_exec_short + LEAQ (R8)(R12*1), R10 + CMPQ R10, CX + JA corrupt + + // genMemMoveShort + // margin: 19, min move: 1 + CMPQ R12, $0x10 + JBE decodeBlockAsm_fast_lit_0_copy_memmove_move_8through16 + CMPQ R12, $0x20 + JBE decodeBlockAsm_fast_lit_0_copy_memmove_move_17through32 + JMP decodeBlockAsm_fast_lit_0_copy_memmove_move_33through64 + +decodeBlockAsm_fast_lit_0_copy_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (SI) + JMP decodeBlockAsm_fast_litcopy_done + +decodeBlockAsm_fast_lit_0_copy_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(R12*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(R12*1) + JMP decodeBlockAsm_fast_litcopy_done + +decodeBlockAsm_fast_lit_0_copy_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(R12*1), X2 + MOVOU -16(R8)(R12*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(R12*1) + MOVOU X3, -16(SI)(R12*1) + JMP decodeBlockAsm_fast_litcopy_done + +decodeBlockAsm_fast_lit_1: + MOVBQZX 1(R8), R12 + ADDQ $0x02, R8 + JMP decodeBlockAsm_fast_litcopy_long + +decodeBlockAsm_fast_lit_2: + MOVWQZX 1(R8), R12 + ADDQ $0x03, R8 + JMP decodeBlockAsm_fast_litcopy_long + +decodeBlockAsm_fast_lit_3: + MOVL (R8), R12 + ADDQ $0x04, R8 + SHRL $0x08, R12 + +decodeBlockAsm_fast_litcopy_long: + LEAQ 30(R12), R12 + LEAQ (SI)(R12*1), R10 + CMPQ R10, AX + JA corrupt + BTL $0x00, R11 + JC decodeBlockAsm_fast_copy_exec + LEAQ (R8)(R12*1), R10 + CMPQ R10, CX + JA corrupt + CMPL R12, $0x40 + JBE decodeBlockAsm_fast_litcopy_short_reduced + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(R12*1), X2 + MOVOU -16(R8)(R12*1), X3 + MOVQ R12, R11 + SHRQ $0x05, R11 + MOVQ SI, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R11 + JA decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(SI)(R13*1), R14 + +decodeBlockAsm_fast_litcopy_longlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R11 + JNA decodeBlockAsm_fast_litcopy_longlarge_big_loop_back + +decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32: + MOVOU -32(R8)(R13*1), X4 + MOVOU -16(R8)(R13*1), X5 + MOVOA X4, -32(SI)(R13*1) + MOVOA X5, -16(SI)(R13*1) + ADDQ $0x20, R13 + CMPQ R12, R13 + JAE 
decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(R12*1) + MOVOU X3, -16(SI)(R12*1) + JMP decodeBlockAsm_fast_litcopy_done + +decodeBlockAsm_fast_litcopy_short_reduced: + // genMemMoveShort + // margin: 16, min move: 30 + CMPQ R12, $0x20 + JBE decodeBlockAsm_fast_lit_longer_copy_memmove_move_17through32 + JMP decodeBlockAsm_fast_lit_longer_copy_memmove_move_33through64 + +decodeBlockAsm_fast_lit_longer_copy_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(R12*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(R12*1) + JMP decodeBlockAsm_fast_litcopy_done + +decodeBlockAsm_fast_lit_longer_copy_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(R12*1), X2 + MOVOU -16(R8)(R12*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(R12*1) + MOVOU X3, -16(SI)(R12*1) + +decodeBlockAsm_fast_litcopy_done: + ADDQ R12, R8 + ADDQ R12, SI + ADDQ R12, DI + CMPQ R8, DX + JAE decodeBlockAsm_fast_end_done + MOVBQZX (R8), R10 + MOVQ R10, R11 + SHRQ $0x02, R11 + CMPQ SI, BX + JAE decodeBlockAsm_fast_end_done + ANDQ $0x03, R10 + JZ decodeBlockAsm_fast_lits + +decodeBlockAsm_fast_copy: + MOVL (R8), R13 + CMPL R10, $0x02 + JB decodeBlockAsm_fast_copy_1 + JEQ decodeBlockAsm_fast_copy_2 + JMP decodeBlockAsm_fast_copy_3 + +decodeBlockAsm_fast_copy_1: + MOVWQZX R13, R9 + ADDQ $0x02, R8 + MOVQ R11, R12 + ANDL $0x0f, R12 + SHRL $0x06, R9 + INCL R9 + SHRL $0x10, R13 + LEAQ 1(R8), R10 + MOVBLZX R13, R11 + ADDL $0x04, R12 + LEAL 18(R11), R11 + CMPL R12, $0x13 + CMOVLEQ R11, R12 + CMOVQEQ R10, R8 + JMP decodeBlockAsm_fast_copy_exec + +decodeBlockAsm_fast_copy_2: + MOVQ R11, R12 + CMPL R11, $0x3d + JB decodeBlockAsm_fast_copy_2_0_extra + JEQ decodeBlockAsm_fast_copy_2_1_extra + CMPL R12, $0x3f + JB decodeBlockAsm_fast_copy_2_2_extra + MOVWQZX 1(R8), R9 + MOVL 2(R8), R12 + ADDQ $0x06, R8 + SHRL $0x08, R12 + LEAL 64(R12), R12 + ADDQ $0x40, R9 + JMP decodeBlockAsm_fast_copy_exec_long_long + +decodeBlockAsm_fast_copy_2_2_extra: + MOVWQZX 1(R8), R9 + MOVWLZX 3(R8), R12 + ADDQ $0x05, R8 + LEAL 64(R12), R12 + ADDQ $0x40, R9 + JMP decodeBlockAsm_fast_copy_exec_long_long + +decodeBlockAsm_fast_copy_2_1_extra: + MOVL R13, R12 + SHRL $0x08, R13 + SHRL $0x18, R12 + MOVWQZX R13, R9 + ADDQ $0x04, R8 + LEAL 64(R12), R12 + ADDQ $0x40, R9 + JMP decodeBlockAsm_fast_copy_exec_long_long + +decodeBlockAsm_fast_copy_2_0_extra: + SHRL $0x08, R13 + MOVWQZX R13, R9 + LEAQ 3(R8), R8 + LEAL 4(R12), R12 + ADDQ $0x40, R9 + JMP decodeBlockAsm_fast_copy_short_no_ol + +decodeBlockAsm_fast_copy_3: + MOVL R13, R9 + ADDQ $0x04, R8 + MOVQ R11, R10 + SHRQ $0x01, R10 + ANDQ $0x03, R10 + BTL $0x00, R11 + JC decodeBlockAsm_fast_copy3_read + SHRL $0x03, R11 + ANDL $0x07, R11 + LEAL 4(R11), R12 + SHRL $0x08, R13 + MOVWQZX R13, R9 + DECQ R8 + INCQ R10 + MOVL (R8), R11 + MOVL R11, (SI) + ADDQ $0x40, R9 + ADDQ R10, R8 + ADDQ R10, SI + ADDQ R10, DI + JMP decodeBlockAsm_fast_copy_short_no_ol + +decodeBlockAsm_fast_copy3_read: + MOVL R9, R12 + SHRL $0x05, R12 + ANDL $0x3f, R12 + SHRL $0x0b, R9 + ADDL $0x00010000, R9 + CMPL R12, $0x3d + JB decodeBlockAsm_fast_copy_3_0_extra + JEQ decodeBlockAsm_fast_copy_3_1_extra + CMPL R12, $0x3e + JEQ decodeBlockAsm_fast_copy_3_2_extra + MOVL -1(R8), R12 + ADDQ $0x03, R8 + SHRL $0x08, R12 + LEAL 64(R12), R12 + JMP decodeBlockAsm_fast_copy_fused_long + +decodeBlockAsm_fast_copy_3_2_extra: + MOVWLZX (R8), R12 + ADDQ $0x02, R8 + LEAL 64(R12), R12 + JMP decodeBlockAsm_fast_copy_fused_long + +decodeBlockAsm_fast_copy_3_1_extra: + MOVBLZX 
(R8), R12 + ADDQ $0x01, R8 + LEAL 64(R12), R12 + JMP decodeBlockAsm_fast_copy_fused_long + +decodeBlockAsm_fast_copy_3_0_extra: + LEAL 4(R12), R12 + MOVL (R8), R11 + MOVL R11, (SI) + ADDQ R10, R8 + ADDQ R10, SI + ADDQ R10, DI + JMP decodeBlockAsm_fast_copy_short_no_ol + +decodeBlockAsm_fast_copy_fused_long: + MOVL (R8), R11 + MOVL R11, (SI) + ADDQ R10, R8 + ADDQ R10, SI + ADDQ R10, DI + JMP decodeBlockAsm_fast_copy_exec_long_long + +decodeBlockAsm_fast_copy_exec_short: + CMPL R9, DI + JA corrupt + LEAQ (SI)(R12*1), R10 + CMPQ R10, AX + JA corrupt + + // Prefetch next tag + MOVBQZX (R8), R10 + MOVQ SI, R11 + SUBQ R9, R11 + CMPL R9, R12 + JB decodeBlockAsm_fast_copy_overlap + JMP decodeBlockAsm_fast_copy_short + +decodeBlockAsm_fast_copy_exec_long_long: + MOVQ SI, R11 + SUBQ R9, R11 + CMPL R9, DI + JA corrupt + LEAQ (SI)(R12*1), R10 + CMPQ R10, AX + JA corrupt + + // Prefetch next tag + MOVBQZX (R8), R10 + + // genMemMoveLong + MOVQ R12, R13 + SHRQ $0x05, R13 + MOVQ SI, R14 + MOVQ R12, R15 + +decodeBlockAsm_fast_copy_long_longlarge_big_loop_back: + MOVOU (R11), X0 + MOVOU 16(R11), X1 + MOVOU X0, (R14) + MOVOU X1, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + SUBQ $0x20, R15 + DECQ R13 + JNZ decodeBlockAsm_fast_copy_long_longlarge_big_loop_back + TESTQ R15, R15 + JZ decodeBlockAsm_fast_copy_done + MOVOU -32(R11)(R15*1), X0 + MOVOU -16(R11)(R15*1), X1 + MOVOU X0, -32(R14)(R15*1) + MOVOU X1, -16(R14)(R15*1) + JMP decodeBlockAsm_fast_copy_done + +decodeBlockAsm_fast_copy_short_no_ol: + MOVQ SI, R11 + SUBQ R9, R11 + CMPL R9, DI + JA corrupt + LEAQ (SI)(R12*1), R10 + CMPQ R10, AX + JA corrupt + + // Prefetch next tag + MOVBQZX (R8), R10 + + // genMemMoveShort + // margin: 16, min move: 4 + CMPQ R12, $0x10 + JBE decodeBlockAsm_fast_copy_short_no_ol_memmove_move_8through16 + CMPQ R12, $0x20 + JBE decodeBlockAsm_fast_copy_short_no_ol_memmove_move_17through32 + JMP decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64 + +decodeBlockAsm_fast_copy_short_no_ol_memmove_move_8through16: + MOVOU (R11), X0 + MOVOU X0, (SI) + JMP decodeBlockAsm_fast_copy_done + +decodeBlockAsm_fast_copy_short_no_ol_memmove_move_17through32: + MOVOU (R11), X0 + MOVOU -16(R11)(R12*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(R12*1) + JMP decodeBlockAsm_fast_copy_done + +decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64: + MOVOU (R11), X0 + MOVOU 16(R11), X1 + MOVOU -32(R11)(R12*1), X2 + MOVOU -16(R11)(R12*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(R12*1) + MOVOU X3, -16(SI)(R12*1) + JMP decodeBlockAsm_fast_copy_done + +decodeBlockAsm_fast_copy_exec: + CMPL R9, DI + JA corrupt + LEAQ (SI)(R12*1), R10 + CMPQ R10, AX + JA corrupt + MOVQ SI, R11 + SUBQ R9, R11 + + // Prefetch next tag + MOVBQZX (R8), R10 + CMPL R9, R12 + JB decodeBlockAsm_fast_copy_overlap + CMPL R12, $0x40 + JA decodeBlockAsm_fast_copy_long + +decodeBlockAsm_fast_copy_short: + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R12, $0x10 + JBE decodeBlockAsm_fast_copy_short_memmove_move_8through16 + CMPQ R12, $0x20 + JBE decodeBlockAsm_fast_copy_short_memmove_move_17through32 + JMP decodeBlockAsm_fast_copy_short_memmove_move_33through64 + +decodeBlockAsm_fast_copy_short_memmove_move_8through16: + MOVOU (R11), X0 + MOVOU X0, (SI) + JMP decodeBlockAsm_fast_copy_done + +decodeBlockAsm_fast_copy_short_memmove_move_17through32: + MOVOU (R11), X0 + MOVOU -16(R11)(R12*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(R12*1) + JMP decodeBlockAsm_fast_copy_done + +decodeBlockAsm_fast_copy_short_memmove_move_33through64: + MOVOU (R11), 
X0 + MOVOU 16(R11), X1 + MOVOU -32(R11)(R12*1), X2 + MOVOU -16(R11)(R12*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(R12*1) + MOVOU X3, -16(SI)(R12*1) + JMP decodeBlockAsm_fast_copy_done + +decodeBlockAsm_fast_copy_long: + // genMemMoveLong + MOVOU (R11), X0 + MOVOU 16(R11), X1 + MOVOU -32(R11)(R12*1), X2 + MOVOU -16(R11)(R12*1), X3 + MOVQ R12, R14 + SHRQ $0x05, R14 + MOVQ SI, R13 + ANDL $0x0000001f, R13 + MOVQ $0x00000040, R15 + SUBQ R13, R15 + DECQ R14 + JA decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32 + LEAQ -32(R11)(R15*1), R13 + LEAQ -32(SI)(R15*1), BP + +decodeBlockAsm_fast_copy_longlarge_big_loop_back: + MOVOU (R13), X4 + MOVOU 16(R13), X5 + MOVOA X4, (BP) + MOVOA X5, 16(BP) + ADDQ $0x20, BP + ADDQ $0x20, R13 + ADDQ $0x20, R15 + DECQ R14 + JNA decodeBlockAsm_fast_copy_longlarge_big_loop_back + +decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32: + MOVOU -32(R11)(R15*1), X4 + MOVOU -16(R11)(R15*1), X5 + MOVOA X4, -32(SI)(R15*1) + MOVOA X5, -16(SI)(R15*1) + ADDQ $0x20, R15 + CMPQ R12, R15 + JAE decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(R12*1) + MOVOU X3, -16(SI)(R12*1) + +decodeBlockAsm_fast_copy_done: + ADDQ R12, SI + ADDQ R12, DI + MOVQ R10, R11 + SHRQ $0x02, R11 + CMPQ R8, DX + JB decodeBlockAsm_fast_loop_nofetch + JMP decodeBlockAsm_fast_end_copy + +decodeBlockAsm_fast_copy_overlap: + CMPL R9, $0x03 + JA decodeBlockAsm_fast_copy_overlap_4 + JE decodeBlockAsm_fast_copy_overlap_3 + CMPL R9, $0x02 + JE decodeBlockAsm_fast_copy_overlap_2 + MOVB (R11), R11 + ADDQ R12, DI + +decodeBlockAsm_fast_loop_overlap_1: + MOVB R11, (SI) + INCQ SI + DECQ R12 + JNZ decodeBlockAsm_fast_loop_overlap_1 + MOVQ R10, R11 + SHRQ $0x02, R11 + CMPQ R8, DX + JB decodeBlockAsm_fast_loop_nofetch + JMP decodeBlockAsm_fast_end_copy + +decodeBlockAsm_fast_copy_overlap_2: + MOVW (R11), R13 + ADDQ R12, DI + BTL $0x00, R12 + JNC decodeBlockAsm_fast_loop_overlap_2 + MOVB R13, (SI) + MOVW 1(R11), R13 + INCQ SI + DECQ R12 + +decodeBlockAsm_fast_loop_overlap_2: + MOVW R13, (SI) + ADDQ $0x02, SI + SUBQ $0x02, R12 + JNZ decodeBlockAsm_fast_loop_overlap_2 + MOVQ R10, R11 + SHRQ $0x02, R11 + CMPQ R8, DX + JB decodeBlockAsm_fast_loop_nofetch + JMP decodeBlockAsm_fast_end_copy + +decodeBlockAsm_fast_copy_overlap_3: + MOVL (R11), R13 + ADDQ R12, DI + SUBQ $0x03, R12 + +decodeBlockAsm_fast_loop_overlap_3: + MOVL R13, (SI) + ADDQ $0x03, SI + SUBQ $0x03, R12 + JA decodeBlockAsm_fast_loop_overlap_3 + MOVW 3(R11)(R12*1), R13 + MOVW R13, (SI)(R12*1) + MOVB 5(R11)(R12*1), R13 + MOVB R13, 2(SI)(R12*1) + LEAQ 3(SI)(R12*1), SI + MOVQ R10, R11 + SHRQ $0x02, R11 + CMPQ R8, DX + JB decodeBlockAsm_fast_loop_nofetch + JMP decodeBlockAsm_fast_end_copy + +decodeBlockAsm_fast_copy_overlap_4: + ADDQ R12, DI + SUBQ $0x04, R12 + +decodeBlockAsm_fast_loop_overlap_4: + MOVL (R11), R13 + ADDQ $0x04, R11 + MOVL R13, (SI) + ADDQ $0x04, SI + SUBQ $0x04, R12 + JA decodeBlockAsm_fast_loop_overlap_4 + MOVL (R11)(R12*1), R13 + MOVL R13, (SI)(R12*1) + LEAQ 4(SI)(R12*1), SI + MOVQ R10, R11 + SHRQ $0x02, R11 + CMPQ R8, DX + JB decodeBlockAsm_fast_loop_nofetch + +decodeBlockAsm_fast_end_copy: +decodeBlockAsm_fast_end_done: +decodeBlockAsm_remain_loop: + CMPQ R8, CX + JAE decodeBlockAsm_remain_end_copy + MOVBQZX (R8), DX + MOVQ DX, BX + SHRQ $0x02, BX + CMPQ SI, AX + JAE decodeBlockAsm_remain_end_copy + ANDQ $0x03, DX + JNZ decodeBlockAsm_remain_copy + +decodeBlockAsm_remain_lits: + MOVL BX, DX + SHRL $0x01, DX + CMPL DX, $0x1d + JB decodeBlockAsm_remain_lit_0 + 
JEQ decodeBlockAsm_remain_lit_1 + CMPL DX, $0x1e + JEQ decodeBlockAsm_remain_lit_2 + JMP decodeBlockAsm_remain_lit_3 + +decodeBlockAsm_remain_lit_0: + INCQ R8 + INCL DX + LEAQ (SI)(DX*1), R10 + CMPQ R10, AX + JA corrupt + BTL $0x00, BX + JC decodeBlockAsm_remain_copy_exec_short + LEAQ (R8)(DX*1), BX + CMPQ BX, CX + JA corrupt + + // genMemMoveShort + // margin: -1, min move: 1 + CMPQ DX, $0x03 + JB decodeBlockAsm_remain_lit_0_copy_memmove_move_1or2 + JE decodeBlockAsm_remain_lit_0_copy_memmove_move_3 + CMPQ DX, $0x08 + JBE decodeBlockAsm_remain_lit_0_copy_memmove_move_4through8 + CMPQ DX, $0x10 + JBE decodeBlockAsm_remain_lit_0_copy_memmove_move_8through16 + CMPQ DX, $0x20 + JBE decodeBlockAsm_remain_lit_0_copy_memmove_move_17through32 + JMP decodeBlockAsm_remain_lit_0_copy_memmove_move_33through64 + +decodeBlockAsm_remain_lit_0_copy_memmove_move_1or2: + MOVB (R8), BL + MOVB -1(R8)(DX*1), R10 + MOVB BL, (SI) + MOVB R10, -1(SI)(DX*1) + JMP decodeBlockAsm_remain_litcopy_done + +decodeBlockAsm_remain_lit_0_copy_memmove_move_3: + MOVW (R8), BX + MOVB 2(R8), R10 + MOVW BX, (SI) + MOVB R10, 2(SI) + JMP decodeBlockAsm_remain_litcopy_done + +decodeBlockAsm_remain_lit_0_copy_memmove_move_4through8: + MOVL (R8), BX + MOVL -4(R8)(DX*1), R10 + MOVL BX, (SI) + MOVL R10, -4(SI)(DX*1) + JMP decodeBlockAsm_remain_litcopy_done + +decodeBlockAsm_remain_lit_0_copy_memmove_move_8through16: + MOVQ (R8), BX + MOVQ -8(R8)(DX*1), R10 + MOVQ BX, (SI) + MOVQ R10, -8(SI)(DX*1) + JMP decodeBlockAsm_remain_litcopy_done + +decodeBlockAsm_remain_lit_0_copy_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DX*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DX*1) + JMP decodeBlockAsm_remain_litcopy_done + +decodeBlockAsm_remain_lit_0_copy_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DX*1), X2 + MOVOU -16(R8)(DX*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DX*1) + MOVOU X3, -16(SI)(DX*1) + JMP decodeBlockAsm_remain_litcopy_done + +decodeBlockAsm_remain_lit_1: + ADDQ $0x02, R8 + CMPQ R8, CX + JA corrupt + MOVBQZX -1(R8), DX + JMP decodeBlockAsm_remain_litcopy_long + +decodeBlockAsm_remain_lit_2: + ADDQ $0x03, R8 + CMPQ R8, CX + JA corrupt + MOVWQZX -2(R8), DX + JMP decodeBlockAsm_remain_litcopy_long + +decodeBlockAsm_remain_lit_3: + ADDQ $0x04, R8 + CMPQ R8, CX + JA corrupt + MOVL -4(R8), DX + SHRL $0x08, DX + +decodeBlockAsm_remain_litcopy_long: + LEAQ 30(DX), DX + LEAQ (SI)(DX*1), R10 + CMPQ R10, AX + JA corrupt + BTL $0x00, BX + JC decodeBlockAsm_remain_copy_exec + LEAQ (R8)(DX*1), BX + CMPQ BX, CX + JA corrupt + CMPL DX, $0x40 + JBE decodeBlockAsm_remain_litcopy_short_reduced + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DX*1), X2 + MOVOU -16(R8)(DX*1), X3 + MOVQ DX, R10 + SHRQ $0x05, R10 + MOVQ SI, BX + ANDL $0x0000001f, BX + MOVQ $0x00000040, R11 + SUBQ BX, R11 + DECQ R10 + JA decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), BX + LEAQ -32(SI)(R11*1), R12 + +decodeBlockAsm_remain_litcopy_longlarge_big_loop_back: + MOVOU (BX), X4 + MOVOU 16(BX), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, BX + ADDQ $0x20, R11 + DECQ R10 + JNA decodeBlockAsm_remain_litcopy_longlarge_big_loop_back + +decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(SI)(R11*1) + MOVOA X5, -16(SI)(R11*1) + ADDQ $0x20, R11 + CMPQ DX, R11 + JAE decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32 + MOVOU X0, (SI) + MOVOU X1, 
16(SI) + MOVOU X2, -32(SI)(DX*1) + MOVOU X3, -16(SI)(DX*1) + JMP decodeBlockAsm_remain_litcopy_done + +decodeBlockAsm_remain_litcopy_short_reduced: + // genMemMoveShort + // margin: -4, min move: 30 + CMPQ DX, $0x20 + JBE decodeBlockAsm_remain_lit_longer_copy_memmove_move_17through32 + JMP decodeBlockAsm_remain_lit_longer_copy_memmove_move_33through64 + +decodeBlockAsm_remain_lit_longer_copy_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DX*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DX*1) + JMP decodeBlockAsm_remain_litcopy_done + +decodeBlockAsm_remain_lit_longer_copy_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DX*1), X2 + MOVOU -16(R8)(DX*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DX*1) + MOVOU X3, -16(SI)(DX*1) + +decodeBlockAsm_remain_litcopy_done: + ADDQ DX, R8 + ADDQ DX, SI + ADDQ DX, DI + CMPQ R8, CX + JAE decodeBlockAsm_remain_end_done + MOVBQZX (R8), DX + MOVQ DX, BX + SHRQ $0x02, BX + CMPQ SI, AX + JAE decodeBlockAsm_remain_end_done + ANDQ $0x03, DX + JZ decodeBlockAsm_remain_lits + +decodeBlockAsm_remain_copy: + CMPL DX, $0x02 + JB decodeBlockAsm_remain_copy_1 + JEQ decodeBlockAsm_remain_copy_2 + JMP decodeBlockAsm_remain_copy_3 + +decodeBlockAsm_remain_copy_1: + ADDQ $0x02, R8 + CMPQ R8, CX + JA corrupt + MOVWQZX -2(R8), R9 + MOVQ BX, DX + ANDL $0x0f, DX + SHRL $0x06, R9 + INCL R9 + CMPL DX, $0x0f + JNE decodeBlockAsm_remain_copy_1_short + ADDQ $0x01, R8 + CMPQ R8, CX + JA corrupt + MOVBLZX -1(R8), DX + LEAL 18(DX), DX + JMP decodeBlockAsm_remain_copy_exec + +decodeBlockAsm_remain_copy_1_short: + LEAL 4(DX), DX + JMP decodeBlockAsm_remain_copy_exec_short + +decodeBlockAsm_remain_copy_2: + MOVQ BX, DX + CMPL BX, $0x3d + JB decodeBlockAsm_remain_copy_2_0_extra + JEQ decodeBlockAsm_remain_copy_2_1_extra + CMPL DX, $0x3f + JB decodeBlockAsm_remain_copy_2_2_extra + ADDQ $0x06, R8 + CMPQ R8, CX + JA corrupt + MOVWQZX -5(R8), R9 + MOVL -4(R8), DX + SHRL $0x08, DX + LEAL 64(DX), DX + ADDQ $0x40, R9 + JMP decodeBlockAsm_remain_copy_exec_long_long + +decodeBlockAsm_remain_copy_2_2_extra: + ADDQ $0x05, R8 + CMPQ R8, CX + JA corrupt + MOVWQZX -4(R8), R9 + MOVWLZX -2(R8), DX + LEAL 64(DX), DX + ADDQ $0x40, R9 + JMP decodeBlockAsm_remain_copy_exec_long_long + +decodeBlockAsm_remain_copy_2_1_extra: + ADDQ $0x04, R8 + CMPQ R8, CX + JA corrupt + MOVWQZX -3(R8), R9 + MOVBLZX -1(R8), DX + LEAL 64(DX), DX + ADDQ $0x40, R9 + JMP decodeBlockAsm_remain_copy_exec_long_long + +decodeBlockAsm_remain_copy_2_0_extra: + LEAQ 3(R8), R8 + CMPQ R8, CX + JA corrupt + MOVWQZX -2(R8), R9 + LEAL 4(DX), DX + ADDQ $0x40, R9 + JMP decodeBlockAsm_remain_copy_short_no_ol + +decodeBlockAsm_remain_copy_3: + ADDQ $0x04, R8 + CMPQ R8, CX + JA corrupt + MOVL -4(R8), R9 + MOVQ BX, R10 + SHRQ $0x01, R10 + ANDQ $0x03, R10 + BTL $0x00, BX + JC decodeBlockAsm_remain_copy3_read + SHRL $0x03, BX + ANDL $0x07, BX + LEAL 4(BX), DX + MOVWQZX -3(R8), R9 + DECQ R8 + INCQ R10 + LEAQ (R8)(R10*1), BX + LEAQ (SI)(R10*1), R11 + CMPQ BX, CX + JA corrupt + CMPQ R11, AX + JA corrupt + + // genMemMoveVeryShort + CMPQ R10, $0x03 + JE decodeBlockAsm_remain_copy2_fused_lits_move_3 + JA decodeBlockAsm_remain_copy2_fused_lits_move_4 + MOVB (R8), BL + MOVB -1(R8)(R10*1), R11 + MOVB BL, (SI) + MOVB R11, -1(SI)(R10*1) + JMP decodeBlockAsm_remain_copy2_fused_lits_done + +decodeBlockAsm_remain_copy2_fused_lits_move_3: + MOVW (R8), BX + MOVB 2(R8), R11 + MOVW BX, (SI) + MOVB R11, 2(SI) + JMP decodeBlockAsm_remain_copy2_fused_lits_done + +decodeBlockAsm_remain_copy2_fused_lits_move_4: + 
MOVL (R8), BX + MOVL BX, (SI) + +decodeBlockAsm_remain_copy2_fused_lits_done: + ADDQ $0x40, R9 + ADDQ R10, R8 + ADDQ R10, SI + ADDQ R10, DI + JMP decodeBlockAsm_remain_copy_short_no_ol + +decodeBlockAsm_remain_copy3_read: + MOVL R9, DX + SHRL $0x05, DX + ANDL $0x3f, DX + SHRL $0x0b, R9 + ADDL $0x00010000, R9 + CMPL DX, $0x3d + JB decodeBlockAsm_remain_copy_3_0_extra + JEQ decodeBlockAsm_remain_copy_3_1_extra + CMPL DX, $0x3e + JEQ decodeBlockAsm_remain_copy_3_2_extra + ADDQ $0x03, R8 + CMPQ R8, CX + JA corrupt + MOVL -4(R8), DX + SHRL $0x08, DX + LEAL 64(DX), DX + JMP decodeBlockAsm_remain_copy_fused_long + +decodeBlockAsm_remain_copy_3_2_extra: + ADDQ $0x02, R8 + CMPQ R8, CX + JA corrupt + MOVWLZX -2(R8), DX + LEAL 64(DX), DX + JMP decodeBlockAsm_remain_copy_fused_long + +decodeBlockAsm_remain_copy_3_1_extra: + ADDQ $0x01, R8 + CMPQ R8, CX + JA corrupt + MOVBLZX -1(R8), DX + LEAL 64(DX), DX + JMP decodeBlockAsm_remain_copy_fused_long + +decodeBlockAsm_remain_copy_3_0_extra: + LEAL 4(DX), DX + TESTL R10, R10 + JZ decodeBlockAsm_remain_copy_short_no_ol + LEAQ (R8)(R10*1), BX + LEAQ (SI)(R10*1), R11 + CMPQ BX, CX + JA corrupt + CMPQ R11, AX + JA corrupt + + // genMemMoveVeryShort + CMPQ R10, $0x03 + JE decodeBlockAsm_remain_copy3s_fused_lits_move_3 + JA decodeBlockAsm_remain_copy3s_fused_lits_move_4 + MOVB (R8), BL + MOVB -1(R8)(R10*1), R11 + MOVB BL, (SI) + MOVB R11, -1(SI)(R10*1) + JMP decodeBlockAsm_remain_copy3s_fused_lits_done + +decodeBlockAsm_remain_copy3s_fused_lits_move_3: + MOVW (R8), BX + MOVB 2(R8), R11 + MOVW BX, (SI) + MOVB R11, 2(SI) + JMP decodeBlockAsm_remain_copy3s_fused_lits_done + +decodeBlockAsm_remain_copy3s_fused_lits_move_4: + MOVL (R8), BX + MOVL BX, (SI) + +decodeBlockAsm_remain_copy3s_fused_lits_done: + ADDQ R10, R8 + ADDQ R10, SI + ADDQ R10, DI + JMP decodeBlockAsm_remain_copy_short_no_ol + +decodeBlockAsm_remain_copy_fused_long: + TESTL R10, R10 + JZ decodeBlockAsm_remain_copy_exec_long_long + LEAQ (R8)(R10*1), BX + LEAQ (SI)(R10*1), R11 + CMPQ BX, CX + JA corrupt + CMPQ R11, AX + JA corrupt + + // genMemMoveVeryShort + CMPQ R10, $0x03 + JE decodeBlockAsm_remain_copy3_fused_lits_move_3 + JA decodeBlockAsm_remain_copy3_fused_lits_move_4 + MOVB (R8), BL + MOVB -1(R8)(R10*1), R11 + MOVB BL, (SI) + MOVB R11, -1(SI)(R10*1) + JMP decodeBlockAsm_remain_copy3_fused_lits_done + +decodeBlockAsm_remain_copy3_fused_lits_move_3: + MOVW (R8), BX + MOVB 2(R8), R11 + MOVW BX, (SI) + MOVB R11, 2(SI) + JMP decodeBlockAsm_remain_copy3_fused_lits_done + +decodeBlockAsm_remain_copy3_fused_lits_move_4: + MOVL (R8), BX + MOVL BX, (SI) + +decodeBlockAsm_remain_copy3_fused_lits_done: + ADDQ R10, R8 + ADDQ R10, SI + ADDQ R10, DI + JMP decodeBlockAsm_remain_copy_exec_long_long + +decodeBlockAsm_remain_copy_exec_short: + CMPL R9, DI + JA corrupt + LEAQ (SI)(DX*1), BX + CMPQ BX, AX + JA corrupt + MOVQ SI, BX + SUBQ R9, BX + CMPL R9, DX + JB decodeBlockAsm_remain_copy_overlap + JMP decodeBlockAsm_remain_copy_short + +decodeBlockAsm_remain_copy_exec_long_long: + MOVQ SI, BX + SUBQ R9, BX + CMPL R9, DI + JA corrupt + LEAQ (SI)(DX*1), R10 + CMPQ R10, AX + JA corrupt + + // genMemMoveLong + MOVQ DX, R10 + SHRQ $0x05, R10 + MOVQ SI, R11 + MOVQ DX, R12 + +decodeBlockAsm_remain_copy_long_longlarge_big_loop_back: + MOVOU (BX), X0 + MOVOU 16(BX), X1 + MOVOU X0, (R11) + MOVOU X1, 16(R11) + ADDQ $0x20, R11 + ADDQ $0x20, BX + SUBQ $0x20, R12 + DECQ R10 + JNZ decodeBlockAsm_remain_copy_long_longlarge_big_loop_back + TESTQ R12, R12 + JZ decodeBlockAsm_remain_copy_done + MOVOU -32(BX)(R12*1), X0 + MOVOU 
-16(BX)(R12*1), X1 + MOVOU X0, -32(R11)(R12*1) + MOVOU X1, -16(R11)(R12*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_short_no_ol: + MOVQ SI, BX + SUBQ R9, BX + CMPL R9, DI + JA corrupt + LEAQ (SI)(DX*1), R10 + CMPQ R10, AX + JA corrupt + + // genMemMoveShort + // margin: -4, min move: 4 + CMPQ DX, $0x08 + JBE decodeBlockAsm_remain_copy_short_no_ol_memmove_move_4through8 + CMPQ DX, $0x10 + JBE decodeBlockAsm_remain_copy_short_no_ol_memmove_move_8through16 + CMPQ DX, $0x20 + JBE decodeBlockAsm_remain_copy_short_no_ol_memmove_move_17through32 + JMP decodeBlockAsm_remain_copy_short_no_ol_memmove_move_33through64 + +decodeBlockAsm_remain_copy_short_no_ol_memmove_move_4through8: + MOVL (BX), R10 + MOVL -4(BX)(DX*1), BX + MOVL R10, (SI) + MOVL BX, -4(SI)(DX*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_short_no_ol_memmove_move_8through16: + MOVQ (BX), R10 + MOVQ -8(BX)(DX*1), BX + MOVQ R10, (SI) + MOVQ BX, -8(SI)(DX*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_short_no_ol_memmove_move_17through32: + MOVOU (BX), X0 + MOVOU -16(BX)(DX*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DX*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_short_no_ol_memmove_move_33through64: + MOVOU (BX), X0 + MOVOU 16(BX), X1 + MOVOU -32(BX)(DX*1), X2 + MOVOU -16(BX)(DX*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DX*1) + MOVOU X3, -16(SI)(DX*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_exec: + CMPL R9, DI + JA corrupt + LEAQ (SI)(DX*1), BX + CMPQ BX, AX + JA corrupt + MOVQ SI, BX + SUBQ R9, BX + CMPL R9, DX + JB decodeBlockAsm_remain_copy_overlap + CMPL DX, $0x40 + JA decodeBlockAsm_remain_copy_long + +decodeBlockAsm_remain_copy_short: + // genMemMoveShort + // margin: -4, min move: 1 + CMPQ DX, $0x03 + JB decodeBlockAsm_remain_copy_short_memmove_move_1or2 + JE decodeBlockAsm_remain_copy_short_memmove_move_3 + CMPQ DX, $0x08 + JBE decodeBlockAsm_remain_copy_short_memmove_move_4through8 + CMPQ DX, $0x10 + JBE decodeBlockAsm_remain_copy_short_memmove_move_8through16 + CMPQ DX, $0x20 + JBE decodeBlockAsm_remain_copy_short_memmove_move_17through32 + JMP decodeBlockAsm_remain_copy_short_memmove_move_33through64 + +decodeBlockAsm_remain_copy_short_memmove_move_1or2: + MOVB (BX), R10 + MOVB -1(BX)(DX*1), BL + MOVB R10, (SI) + MOVB BL, -1(SI)(DX*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_short_memmove_move_3: + MOVW (BX), R10 + MOVB 2(BX), BL + MOVW R10, (SI) + MOVB BL, 2(SI) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_short_memmove_move_4through8: + MOVL (BX), R10 + MOVL -4(BX)(DX*1), BX + MOVL R10, (SI) + MOVL BX, -4(SI)(DX*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_short_memmove_move_8through16: + MOVQ (BX), R10 + MOVQ -8(BX)(DX*1), BX + MOVQ R10, (SI) + MOVQ BX, -8(SI)(DX*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_short_memmove_move_17through32: + MOVOU (BX), X0 + MOVOU -16(BX)(DX*1), X1 + MOVOU X0, (SI) + MOVOU X1, -16(SI)(DX*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_short_memmove_move_33through64: + MOVOU (BX), X0 + MOVOU 16(BX), X1 + MOVOU -32(BX)(DX*1), X2 + MOVOU -16(BX)(DX*1), X3 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DX*1) + MOVOU X3, -16(SI)(DX*1) + JMP decodeBlockAsm_remain_copy_done + +decodeBlockAsm_remain_copy_long: + // genMemMoveLong + MOVOU (BX), X0 + MOVOU 16(BX), X1 + MOVOU -32(BX)(DX*1), X2 + MOVOU -16(BX)(DX*1), X3 
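+	// Head and tail vectors (X0-X3) are loaded up front; the aligned
+	// 32-byte loop copies the middle, and the final stores patch the
+	// unaligned edges.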
+ MOVQ DX, R11 + SHRQ $0x05, R11 + MOVQ SI, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32 + LEAQ -32(BX)(R12*1), R10 + LEAQ -32(SI)(R12*1), R13 + +decodeBlockAsm_remain_copy_longlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA decodeBlockAsm_remain_copy_longlarge_big_loop_back + +decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32: + MOVOU -32(BX)(R12*1), X4 + MOVOU -16(BX)(R12*1), X5 + MOVOA X4, -32(SI)(R12*1) + MOVOA X5, -16(SI)(R12*1) + ADDQ $0x20, R12 + CMPQ DX, R12 + JAE decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32 + MOVOU X0, (SI) + MOVOU X1, 16(SI) + MOVOU X2, -32(SI)(DX*1) + MOVOU X3, -16(SI)(DX*1) + +decodeBlockAsm_remain_copy_done: + ADDQ DX, SI + ADDQ DX, DI + JMP decodeBlockAsm_remain_loop + +decodeBlockAsm_remain_copy_overlap: + ADDQ DX, DI + +decodeBlockAsm_remain_copy_overlap_simple: + MOVB (BX), R10 + MOVB R10, (SI) + INCQ BX + INCQ SI + DECQ DX + JNZ decodeBlockAsm_remain_copy_overlap_simple + JMP decodeBlockAsm_remain_loop + +decodeBlockAsm_remain_end_copy: +decodeBlockAsm_remain_end_done: + MOVQ src_base+24(FP), AX + MOVQ src_len+32(FP), CX + MOVQ dst_base+0(FP), DX + MOVQ dst_len+8(FP), BX + LEAQ (DX)(BX*1), DX + LEAQ (AX)(CX*1), AX + CMPQ SI, DX + JNE corrupt + CMPQ R8, AX + JNE corrupt + MOVQ $0x00000000, ret+48(FP) + RET + +corrupt: + MOVQ $0x00000001, ret+48(FP) + RET diff --git a/vendor/github.com/minio/minlz/asm_none.go b/vendor/github.com/minio/minlz/asm_none.go new file mode 100644 index 0000000000..0943444f77 --- /dev/null +++ b/vendor/github.com/minio/minlz/asm_none.go @@ -0,0 +1,326 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !amd64 || appengine || !gc || noasm || purego + +package minlz + +import ( + "fmt" + "math/bits" +) + +const hasAsm = false + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlock(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBlockGo(dst, src) +} + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetter(dst, src []byte) (d int) { + return encodeBlockBetterGo(dst, src) +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. 
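+// Lengths 1-29 are stored directly in the tag byte; longer runs write
+// length-30 into one to three extra bytes, per the table below.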
+// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 +func emitLiteral(dst, lit []byte) int { + // 0-28: Length 1 -> 29 + // 29: Length (Read 1) + 1 + // 30: Length (Read 2) + 1 + // 31: Length (Read 3) + 1 + if len(lit) == 0 { + return 0 + } + if debugEncode { + fmt.Println("(literal)", len(lit)) + } + i, n := 0, uint(len(lit)-1) + + switch { + case n < 29: + store8(dst, 0, uint8(n)<<3|tagLiteral) + i = 1 + case n < 1<<8+29: + store8(dst, 1, uint8(n-29)) + store8(dst, 0, 29<<3|tagLiteral) + i = 2 + case n < 1<<16+29: + n -= 29 + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 30<<3 | tagLiteral + i = 3 + case n < 1<<24+29: + n -= 29 + dst[3] = uint8(n >> 16) + dst[2] = uint8(n >> 8) + dst[1] = uint8(n) + dst[0] = 31<<3 | tagLiteral + i = 4 + default: + panic("literal block too long") + } + return i + copy(dst[i:], lit) +} + +// emitRepeat writes a repeat chunk and returns the number of bytes written. +func emitRepeat(dst []byte, length int) int { + // Repeat offset, make length cheaper + if debugEncode { + fmt.Println("(repeat)", length) + } + + if debugEncode && length < 0 { + panic(fmt.Sprintf("invalid length %d", length)) + } + if length < 30 { + store8(dst, 0, uint8(length-1)<<3|tagRepeat) + return 1 + } + length -= 30 + if length < 256 { + store8(dst, 1, uint8(length>>0)) + store8(dst, 0, 29<<3|tagRepeat) + return 2 + } + + if length < 65536 { + dst[2] = uint8(length >> 8) + dst[1] = uint8(length >> 0) + dst[0] = 30<<3 | tagRepeat + return 3 + } + dst[3] = uint8(length >> 16) + dst[2] = uint8(length >> 8) + dst[1] = uint8(length >> 0) + dst[0] = 31<<3 | tagRepeat + return 4 +} + +// encodeCopy3 encodes a copy operation with 24 bit offset. +// length must be at least 1 and < 1<<24 +func encodeCopy3(dst []byte, offset, length, lits int) int { + // Repeat offset, make length cheaper + length -= 4 + if debugEncode && length < 0 { + panic(fmt.Sprintf("invalid length %d", length)) + } + if debugEncode && offset < 65536 { + panic(fmt.Sprintf("invalid offset %d", offset)) + } + + // Encode offset + var encoded uint32 + encoded = uint32(offset-65536)<<11 | tagCopy3 | uint32(lits<<3) + + if length <= 60 { + encoded |= uint32(length << 5) + store32(dst, 0, encoded) + return 4 + } + length -= 60 + if length < 256 { + store8(dst, 4, uint8(length>>0)) + encoded |= 61 << 5 + store32(dst, 0, encoded) + return 5 + } + + if length < 65536 { + encoded |= 62 << 5 + dst[5] = uint8(length >> 8) + dst[4] = uint8(length >> 0) + store32(dst, 0, encoded) + return 6 + } + encoded |= 63 << 5 + dst[6] = uint8(length >> 16) + dst[5] = uint8(length >> 8) + dst[4] = uint8(length >> 0) + store32(dst, 0, encoded) + return 7 +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// +// dst is long enough to hold the encoded bytes +func emitCopy(dst []byte, offset, length int) int { + if debugEncode { + fmt.Println("(copy) length:", length, "offset:", offset) + if offset == 0 || offset > maxCopy3Offset { + panic(fmt.Sprintf("(emitCopy) invalid offset %d", offset)) + } + } + + if offset > maxCopy2Offset { + // return encodeCopy3(dst, offset, length, 0) expanded... 
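+		// The block below is encodeCopy3 hand-expanded with lits == 0 (per
+		// the note above), presumably to keep this path free of a call.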
+ // Repeat offset, make length cheaper + length -= 4 + if debugEncode && length < 0 { + panic(fmt.Sprintf("invalid length %d", length)) + } + if debugEncode && offset < 65536 { + panic(fmt.Sprintf("invalid offset %d", offset)) + } + + // Encode offset + var encoded uint32 + encoded = uint32(offset-65536)<<11 | tagCopy3 + + if length <= 60 { + encoded |= uint32(length << 5) + store32(dst, 0, encoded) + return 4 + } + length -= 60 + if length < 256 { + dst[4] = uint8(length >> 0) + encoded |= 61 << 5 + store32(dst, 0, encoded) + return 5 + } + + if length < 65536 { + encoded |= 62 << 5 + store16(dst[:], 4, uint16(length)) + store32(dst, 0, encoded) + return 6 + } + encoded |= 63 << 5 + dst[6] = uint8(length >> 16) + dst[5] = uint8(length >> 8) + dst[4] = uint8(length >> 0) + store32(dst, 0, encoded) + return 7 + } + + // Small offset. Use copy1 + if offset <= maxCopy1Offset { + offset-- + if length < 15+4 { + x := uint16(offset<<6) | uint16(length-4)<<2 | tagCopy1 + store16(dst, 0, x) + return 2 + } + if length < 256+18 { + x := uint16(offset<<6) | (uint16(15)<<2 | tagCopy1) + store16(dst, 0, x) + dst[2] = uint8(length - 18) + return 3 + } + // Encode as Copy1 and repeat + x := uint16(offset<<6) | uint16(14)<<2 | tagCopy1 + store16(dst, 0, x) + return 2 + emitRepeat(dst[2:], length-18) + } + + return encodeCopy2(dst, offset, length) +} + +// emitCopyLits2 emit 2 byte offset copy with literals. +// len(lits) must be 1 - 4. +// The caller should only call when the offset can contain a literal encoding. +// Longer copies are emitted as copy+repeat. +func emitCopyLits2(dst, lits []byte, offset, length int) int { + if debugEncode { + if offset < minCopy2Offset || offset > maxCopy2Offset { + panic(fmt.Sprintf("invalid offset %d", offset)) + } + if len(lits) > maxCopy2Lits { + panic(fmt.Sprintf("invalid literal count %d", len(lits))) + } + fmt.Println("(copy2) lits:", len(lits), "length:", length, "offset:", offset) + } + offset -= minCopy2Offset + // Emit as literal + 2 byte offset code. + // If longer than 11 use repeat for remaining. + length -= 4 + const copy2LitMaxLenRaw = copy2LitMaxLen - 4 + if length > copy2LitMaxLenRaw { + store16(dst, 1, uint16(offset)) + store8(dst, 0, tagCopy2Fused|uint8((copy2LitMaxLenRaw)<<5)|uint8(len(lits)-1)<<3) + n := copy(dst[3:], lits) + 3 + return n + emitRepeat(dst[n:], length-copy2LitMaxLenRaw) + } + store16(dst, 1, uint16(offset)) + store8(dst, 0, tagCopy2Fused|uint8(length<<5)|uint8(len(lits)-1)<<3) + return copy(dst[3:], lits) + 3 +} + +// emitCopyLits3 emit a 3 byte offset copy with literals. +// len(lits) must be 1 - 3. +// The caller should only call when the offset can contain a literal encoding. 
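+// The literal count is packed into the copy3 tag itself (see
+// encodeCopy3), and the literal bytes are appended after the copy.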
+func emitCopyLits3(dst, lits []byte, offset, length int) int { + if debugEncode { + fmt.Println("(copy3) lits:", len(lits), "length:", length, "offset:", offset) + if offset > maxCopy3Offset { + panic(fmt.Sprintf("(emitCopyLits3) invalid offset %d", offset)) + } + } + n := encodeCopy3(dst, offset, length, len(lits)) + copy(dst[n:], lits) + return n + len(lits) +} + +// matchLen returns how many bytes match in a and b +// +// It assumes that: +// +// len(a) <= len(b) +func matchLen(a []byte, b []byte) int { + b = b[:len(a)] + var checked int + for len(a)-checked >= 8 { + if diff := load64(a, checked) ^ load64(b, checked); diff != 0 { + return checked + (bits.TrailingZeros64(diff) >> 3) + } + checked += 8 + } + a = a[checked:] + b = b[checked:] + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + return int(i) + checked + } + } + return len(a) + checked +} + +// cvtLZ4Block converts an LZ4 block to MinLZ +func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("not implemented") +} diff --git a/vendor/github.com/minio/minlz/decode.go b/vendor/github.com/minio/minlz/decode.go new file mode 100644 index 0000000000..730ea6b23a --- /dev/null +++ b/vendor/github.com/minio/minlz/decode.go @@ -0,0 +1,622 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "encoding/binary" + "errors" + "fmt" + + "github.com/klauspost/compress/s2" +) + +const ( + decodeErrCodeCorrupt = 1 +) + +var ( + // ErrCorrupt reports that the input is invalid. + ErrCorrupt = errors.New("minlz: corrupt input") + // ErrCRC reports that the input failed CRC validation (streams only) + ErrCRC = errors.New("minlz: corrupt input, crc mismatch") + // ErrTooLarge reports that the uncompressed length is too large. + ErrTooLarge = errors.New("minlz: decoded block is too large") + // ErrUnsupported reports that the input isn't supported. + ErrUnsupported = errors.New("minlz: unsupported input") + // ErrInvalidLevel is returned when an invalid compression level is requested. + ErrInvalidLevel = errors.New("minlz: invalid compression level") +) + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// +// This decoder has automatic fallback to Snappy/S2. +// To reject fallback check with IsMinLZ. +// +// The dst and src must not overlap. It is valid to pass a nil dst. 
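+//
+// A minimal usage sketch (the input name is hypothetical):
+//
+//	decoded, err := Decode(nil, compressedBlock)
+//	if err != nil {
+//		// ErrCorrupt, ErrTooLarge, ErrUnsupported, ...
+//	}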
+func Decode(dst, src []byte) ([]byte, error) { + isMLZ, lits, block, dLen, err := isMinLZ(src) + if err != nil { + return nil, err + } + if lits { + return append(dst[:0], block...), nil + } + + if !isMLZ { + if l, _ := s2.DecodedLen(block); l > MaxBlockSize { + return nil, ErrTooLarge + } + if dst, err := s2.Decode(dst, block); err != nil { + return nil, ErrCorrupt + } else { + return dst, nil + } + } + if dLen <= cap(dst) { + dst = dst[:dLen] + } else { + dst = make([]byte, dLen, dLen) + } + if minLZDecode(dst, block) != 0 { + return dst, ErrCorrupt + } + return dst, nil +} + +// AppendDecoded will append the decoded version of src to dst. +// If the decoded content cannot fit within dst, it will cause an allocation. +// This decoder has automatic fallback to Snappy/S2. +// To reject fallback check with IsMinLZ. +// The dst and src must not overlap. It is valid to pass a nil dst. +func AppendDecoded(dst, src []byte) ([]byte, error) { + dLen, err := DecodedLen(src) + if err != nil { + return dst, err + } + if dLen > cap(dst)-len(dst) { + oLen := len(dst) + dst = append(dst, make([]byte, dLen)...) + dst = dst[:oLen] + } + d, err := Decode(dst[len(dst):], src) + if err != nil { + return dst, err + } + if len(d) != dLen { + return dst, ErrCorrupt + } + return dst[:len(dst)+dLen], nil +} + +// DecodedLen returns the length of the decoded block. +// This length will never be exceeded when decoding a block. +func DecodedLen(src []byte) (int, error) { + _, _, _, v, err := isMinLZ(src) + return v, err +} + +// IsMinLZ returns whether the block is a minlz block +// and returns the size of the decompressed block. +func IsMinLZ(src []byte) (ok bool, size int, err error) { + ok, _, _, size, err = isMinLZ(src) + return +} + +// IsMinLZ returns true if the block is a minlz block. +func isMinLZ(src []byte) (ok, literals bool, block []byte, size int, err error) { + if len(src) <= 1 { + if len(src) == 0 { + err = ErrCorrupt + return + } + if src[0] == 0 { + // Size 0 block. Could be MinLZ. + return true, true, src[1:], 0, nil + } + } + if src[0] != 0 { + // Older - Snappy or S2... + v, _, err := decodedLen(src) + return false, false, src, v, err + } + src = src[1:] + v, headerLen, err := decodedLen(src) + if err != nil { + return false, false, nil, 0, err + } + if v > MaxBlockSize { + return false, false, nil, 0, ErrTooLarge + } + src = src[headerLen:] + if len(src) == 0 { + return false, false, nil, 0, ErrCorrupt + } + if v == 0 { + // Literals, rest of block... + return true, true, src, len(src), nil + } + if v < len(src) { + return false, false, src, v, ErrCorrupt + } + return true, false, src, v, nil +} + +// decodedLen returns the length of the decoded block and the number of bytes +// that the length header occupied. +func decodedLen(src []byte) (blockLen, headerLen int, err error) { + v, n := binary.Uvarint(src) + if n <= 0 || v > 0xffffffff { + return 0, 0, ErrCorrupt + } + + const wordSize = 32 << (^uint(0) >> 32 & 1) + if wordSize == 32 && v > 0x7fffffff { + return 0, 0, ErrTooLarge + } + return int(v), n, nil +} + +// minLZDecode writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. +// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. 
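+// This is the portable Go loop; the amd64 decodeBlockAsm earlier in
+// this diff follows the same convention (0 on success, 1 on corrupt
+// input).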
+func minLZDecodeGo(dst, src []byte) int { + const debug = debugDecode + const debugErrors = false || debug + if debug { + fmt.Println("Starting decode, src:", len(src), "dst len:", len(dst)) + } + var d, s, length int + offset := 1 + + // As long as we can read at least 11 bytes... (longest code possible +4 lits) + for s < len(src)-11 { + // Maximum input needed. + if debug { + //fmt.Printf("in:%x, tag: %02b va:%x - src: %d, dst: %d\n", src[s], src[s]&3, src[s]>>2, s, d) + } + + switch load8(src, s) & 0x03 { + case tagLiteral: + v := load8(src, s) + x := v >> 3 + switch { + case x < 29: + s++ + length = int(x) + 1 + case x == 29: // 1 byte length + length = 30 + int(load8(src, s+1)) + s += 2 + case x == 30: // 2 byte length + length = 30 + int(load16(src, s+1)) + s += 3 + default: + // case x == 31: // 3 byte length + // Load as 32 bit and shift down. + length = 30 + int(load32(src, s)>>8) + s += 4 + } + if v&4 != 0 { + // repeat + if debug { + fmt.Print(d, ": (repeat)") + } + goto docopy + } + if length > len(dst)-d || length > len(src)-s { + if debugErrors { + fmt.Println("corrupt: lit size", length, "dst avail:", len(dst)-d, "src avail:", len(src)-s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Print(d, ": (literals) length: ", length, " d-after: ", d+length, "\n") + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + if debug { + fmt.Print(d, ": (copy1) ") + } + length = int(load8(src, s)) >> 2 & 15 + offset = int(load16(src, s)>>6) + 1 + if length == 15 { + // Extended length with 1 byte + length = int(load8(src, s+2)) + 18 + s += 3 + } else { + length += 4 + s += 2 + } + case tagCopy2: + if debug { + fmt.Print(d, ": (copy2)") + } + length = int(load8(src, s)) >> 2 + offset = int(load16(src, s+1)) + if length <= 60 { + length += 4 + s += 3 + } else { + switch length { + case 61: // 1 byte + 64 + length = int(load8(src, s+3)) + 64 + s += 4 + case 62: // 2 bytes + 64 + length = int(load16(src, s+3)) + 64 + s += 5 + case 63: // 3 bytes + 64 + // Load as 32 bit and shift down. 
+ length = int(load32(src, s+2)>>8) + 64 + s += 6 + } + } + offset += minCopy2Offset + case 0x3: + val := load32(src, s) + isCopy3 := val&4 != 0 + litLen := int(val>>3) & 3 + if !isCopy3 { + if debug { + fmt.Print(d, ": (copy2 fused) ") + } + length = 4 + int(val>>5)&7 + offset = int(val>>8)&65535 + minCopy2Offset + s += 3 + litLen++ + } else { + lengthTmp := (val >> 5) & 63 + offset = int(val>>11) + minCopy3Offset + if debug { + fmt.Print(d, ": (copy3)") + } + switch { + case lengthTmp < 61: + length = int(lengthTmp) + 4 + s += 4 + case lengthTmp == 61: + length = int(load8(src, s+4)) + 64 + s += 5 + case lengthTmp == 62: + length = int(load16(src, s+4)) + 64 + s += 6 + case lengthTmp == 63: + length = int(load32(src, s+3)>>8) + 64 + s += 7 + default: + panic("unreachable") + } + } + if litLen > 0 { + if debug { + fmt.Print(" - lits: ", litLen, " ") + } + if len(dst)-d < 4 { + if debugErrors { + fmt.Println("corrupt: lit size", length+litLen, "dst avail:", len(dst)-d, "src avail:", len(src)-s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + // We will always have room to read + store32(dst, d, load32(src, s)) + s += litLen + d += litLen + } + } + docopy: + if d < offset || length > len(dst)-d { + if debugErrors { + fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + + if debug { + fmt.Println("- copy, length:", length, "offset:", offset, "d-after:", d+length, "s-after:", s) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + + // Remaining with extra checks... 
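+	// The loop below decodes the tail of the input, where the main loop's
+	// "s < len(src)-11" headroom no longer holds, so every multi-byte read
+	// is bounds-checked before use.
+	//
+	// Worked opcode example (illustrative bytes, assuming tagCopy1 == 1):
+	// 0x45 0x01 selects tagCopy1 (0x45&3 == 1), giving
+	// length = (0x45>>2)&15 + 4 = 5 and offset = (0x0145>>6) + 1 = 6,
+	// i.e. "copy 5 bytes starting 6 bytes back".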
+ for s < len(src) { + if debug { + //fmt.Printf("in:%x, tag: %02b va:%x - src: %d, dst: %d\n", src[s], src[s]&3, src[s]>>2, s, d) + } + switch load8(src, s) & 0x03 { + case tagLiteral: + v := load8(src, s) + x := v >> 3 + switch { + case x < 29: + s++ + length = int(x + 1) + case x == 29: + s += 2 + if s > len(src) { + if debugErrors { + fmt.Println("(1)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-1]) + 30) + case x == 30: + s += 3 + if s > len(src) { + if debugErrors { + fmt.Println("(2)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-2]) | uint32(src[s-1])<<8 + 30) + default: + // case x == 31: + s += 4 + if s > len(src) { + if debugErrors { + fmt.Println("(3)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + 30) + } + + if v&4 != 0 { + // repeat + if debug { + fmt.Print(d, ": (repeat)") + } + goto doCopy2 + } + + if length > len(dst)-d || length > len(src)-s { + if debugErrors { + fmt.Println("corrupt: lit size", length, "dst avail:", len(dst)-d, "src avail:", len(src)-s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + if debug { + fmt.Print(d, ": (literals), length: ", length, " d-after: ", d+length) + } + + copy(dst[d:], src[s:s+length]) + d += length + s += length + + if debug { + fmt.Println("") + } + continue + + case tagCopy1: + if debug { + fmt.Print(d, ": (copy1 -wut?) ") + } + s += 2 + if s > len(src) { + if debugErrors { + fmt.Println("(5-1)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + + length = int(src[s-2]) >> 2 & 15 + offset = int(load16(src, s-2)>>6) + 1 + if length == 15 { + s++ + if s > len(src) { + if debugErrors { + fmt.Println("(5)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(src[s-1]) + 18 + } else { + length += 4 + } + case tagCopy2: + if debug { + fmt.Print(d, ": (copy2) ") + } + s += 3 + if uint(s) > uint(len(src)) { + if debugErrors { + fmt.Println("(7)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(src[s-3]) >> 2 + offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + if length <= 60 { + length += 4 + } else { + switch length { + case 61: + s++ + if uint(s) > uint(len(src)) { + if debugErrors { + fmt.Println("(8)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(src[s-1]) + 64 + case 62: + s += 2 + if uint(s) > uint(len(src)) { + if debugErrors { + fmt.Println("(9)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(src[s-2]) | int(src[s-1])<<8 + 64 + case 63: + s += 3 + if s > len(src) { + if debugErrors { + fmt.Println("(10)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(src[s-3]) | int(src[s-2])<<8 | int(src[s-1])<<16 + 64 + } + } + offset += minCopy2Offset + case 0x3: + s += 4 + if s > len(src) { + if debugErrors { + fmt.Println("(11)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + val := load32(src, s-4) + isCopy3 := val&4 != 0 + litLen := int(val>>3) & 3 + if !isCopy3 { + if debug { + fmt.Print(d, ": (copy2 fused) ") + } + length = 4 + int(val>>5)&7 + offset = int(val>>8)&65535 + minCopy2Offset + s-- + litLen++ + } else { + if debug { + fmt.Print(d, ": (copy3) 
") + } + lengthTmp := (val >> 5) & 63 + offset = int(val>>11) + minCopy3Offset + if lengthTmp >= 61 { + switch lengthTmp { + case 61: + s++ + if s > len(src) { + if debugErrors { + fmt.Println("(13)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(src[s-1]) + 64 + case 62: + s += 2 + if s > len(src) { + if debugErrors { + fmt.Println("(14)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = (int(src[s-2]) | int(src[s-1])<<8) + 64 + default: + s += 3 + if s > len(src) { + if debugErrors { + fmt.Println("(15)read out of bounds, src pos:", s, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + length = int(src[s-3]) | int(src[s-2])<<8 | int(src[s-1])<<16 + 64 + } + } else { + length = int(lengthTmp + 4) + } + } + + if litLen > 0 { + if litLen > len(dst)-d || s+litLen > len(src) { + if debugErrors { + fmt.Println("corrupt: lits size", litLen, "dst avail:", len(dst)-d, "src avail:", len(src)-s) + } + return decodeErrCodeCorrupt + } + copy(dst[d:], src[s:s+litLen]) + d += litLen + s += litLen + } + } + + doCopy2: + if offset <= 0 || d < offset || length > len(dst)-d { + if debugErrors { + fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d) + } + return decodeErrCodeCorrupt + } + + if debug { + fmt.Println(" - copy, length:", length, "offset:", offset, "d-after:", d+length, "s-after:", s) + } + + // Copy from an earlier sub-slice of dst to a later sub-slice. + // If no overlap, use the built-in copy: + if offset > length { + copy(dst[d:d+length], dst[d-offset:]) + d += length + continue + } + + // Unlike the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + // + // We align the slices into a and b and show the compiler they are the same size. + // This allows the loop to run without bounds checks. + a := dst[d : d+length] + b := dst[d-offset:] + b = b[:len(a)] + for i := range a { + a[i] = b[i] + } + d += length + } + if debug { + fmt.Println("Done, d:", d, "s:", s, len(dst)) + } + if d != len(dst) { + if debugErrors { + fmt.Println("corrupt: dst len:", len(dst), "d:", d) + } + return decodeErrCodeCorrupt + } + return 0 +} diff --git a/vendor/github.com/minio/minlz/decode_amd64.go b/vendor/github.com/minio/minlz/decode_amd64.go new file mode 100644 index 0000000000..ae9d6424c2 --- /dev/null +++ b/vendor/github.com/minio/minlz/decode_amd64.go @@ -0,0 +1,36 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build amd64 && !(appengine || !gc || noasm || purego) + +package minlz + +import ( + "github.com/minio/minlz/internal/race" +) + +// minLZDecode writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. 
+// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. +func minLZDecode(dst, src []byte) int { + if dst == nil { + panic("nil dst") + } + + race.ReadSlice(src) + race.WriteSlice(dst) + return decodeBlockAsm(dst, src) +} diff --git a/vendor/github.com/minio/minlz/decode_other.go b/vendor/github.com/minio/minlz/decode_other.go new file mode 100644 index 0000000000..4d4f556abd --- /dev/null +++ b/vendor/github.com/minio/minlz/decode_other.go @@ -0,0 +1,26 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !amd64 || appengine || !gc || noasm || purego + +package minlz + +// minLZDecode writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. +// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. +func minLZDecode(dst, src []byte) int { + return minLZDecodeGo(dst, src) +} diff --git a/vendor/github.com/minio/minlz/dict.go b/vendor/github.com/minio/minlz/dict.go new file mode 100644 index 0000000000..f1cfb4a097 --- /dev/null +++ b/vendor/github.com/minio/minlz/dict.go @@ -0,0 +1,286 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "sync" +) + +const ( + // minDictSize is the minimum dictionary size when repeat has been read. + minDictSize = 16 + + // maxDictSize is the maximum dictionary size when repeat has been read. + maxDictSize = 65536 + + // maxDictSrcOffset is the maximum offset where a dictionary entry can start. + maxDictSrcOffset = 65535 +) + +// dict contains a dictionary that can be used for encoding and decoding s2 +type dict struct { + dict []byte + repeat int // Repeat as index of dict + + fast, better, best sync.Once + fastTable *[1 << 14]uint16 + + betterTableShort *[1 << 14]uint16 + betterTableLong *[1 << 17]uint16 + + bestTableShort *[1 << 16]uint32 + bestTableLong *[1 << 19]uint32 +} + +/* +// NewDict will read a dictionary. +// It will return nil if the dictionary is invalid. +func NewDict(dct []byte) *dict { + if len(dct) == 0 { + return nil + } + var d dict + // Repeat is the first value of the dict + r, n := binary.Uvarint(dct) + if n <= 0 { + return nil + } + dct = dct[n:] + d.dict = dct + if cap(d.dict) < len(d.dict)+16 { + d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...) 
+ } + if len(dct) < minDictSize || len(dct) > maxDictSize { + return nil + } + d.repeat = int(r) + if d.repeat > len(dct) { + return nil + } + return &d +} + +// Bytes will return a serialized version of the dictionary. +// The output can be sent to NewDict. +func (d *dict) Bytes() []byte { + dst := make([]byte, binary.MaxVarintLen16+len(d.dict)) + return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...) +} + +// makeDict will create a dictionary. +// 'data' must be at least minDictSize. +// If data is longer than maxDictSize only the last maxDictSize bytes will be used. +// If searchStart is set the start repeat value will be set to the last +// match of this content. +// If no matches are found, it will attempt to find shorter matches. +// This content should match the typical start of a block. +// If at least 4 bytes cannot be matched, repeat is set to start of block. +func makeDict(data []byte, searchStart []byte) *dict { + if len(data) == 0 { + return nil + } + if len(data) > maxDictSize { + data = data[len(data)-maxDictSize:] + } + var d dict + dict := data + d.dict = dict + if cap(d.dict) < len(d.dict)+16 { + d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...) + } + if len(dict) < minDictSize { + return nil + } + + // Find the longest match possible, last entry if multiple. + for s := len(searchStart); s > 4; s-- { + if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 { + d.repeat = idx + break + } + } + + return &d +} + +// makeDictManual will create a dictionary. +// 'data' must be at least minDictSize and less than or equal to maxDictSize. +// A manual first repeat index into data must be provided. +// It must be less than len(data)-8. +func makeDictManual(data []byte, firstIdx uint16) *dict { + if len(data) < minDictSize || int(firstIdx) >= len(data)-8 || len(data) > maxDictSize { + return nil + } + var d dict + dict := data + d.dict = dict + if cap(d.dict) < len(d.dict)+16 { + d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...) + } + + d.repeat = int(firstIdx) + return &d +} + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as encoding, +// and does not make for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func (d *dict) Encode(dst, src []byte, level int) ([]byte, error) { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
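+	// For example, len(src) == 100000 emits the three varint bytes
+	// 0xa0 0x8d 0x06 (100000 == 0x20 | 0x0d<<7 | 0x06<<14).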
+ dstP := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:dstP], nil + } + if len(src) < minNonLiteralBlockSize { + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP], nil + } + var n int + switch level { + case LevelFastest: + n = encodeBlockDictGo(dst[dstP:], src, d) + case LevelBalanced: + n = encodeBlockBetterDict(dst[dstP:], src, d) + case LevelSmallest: + n = encodeBlockBest(dst[dstP:], src, d) + default: + return nil, ErrInvalidLevel + } + if n > 0 { + dstP += n + return dst[:dstP], nil + } + // Not compressible + dstP += emitLiteral(dst[dstP:], src) + return dst[:dstP], nil +} + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +func (d *dict) Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if dLen <= cap(dst) { + dst = dst[:dLen] + } else { + dst = make([]byte, dLen) + } + if minLZDecodeDict(dst, src[s:], d) != 0 { + return nil, ErrCorrupt + } + return dst, nil +} + +func (d *dict) initFast() { + d.fast.Do(func() { + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint16 + // We stop so any entry of length 8 can always be read. + for i := 0; i < len(d.dict)-8-2; i += 3 { + x0 := load64(d.dict, i) + h0 := hash6(x0, tableBits) + h1 := hash6(x0>>8, tableBits) + h2 := hash6(x0>>16, tableBits) + table[h0] = uint16(i) + table[h1] = uint16(i + 1) + table[h2] = uint16(i + 2) + } + d.fastTable = &table + }) +} + +func (d *dict) initBetter() { + d.better.Do(func() { + const ( + // Long hash matches. + lTableBits = 17 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint16 + var sTable [maxSTableSize]uint16 + + // We stop so any entry of length 8 can always be read. + for i := 0; i < len(d.dict)-8; i++ { + cv := load64(d.dict, i) + lTable[hash7(cv, lTableBits)] = uint16(i) + sTable[hash4(cv, sTableBits)] = uint16(i) + } + d.betterTableShort = &sTable + d.betterTableLong = &lTable + }) +} + +func (d *dict) initBest() { + d.best.Do(func() { + const ( + // Long hash matches. + lTableBits = 19 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 16 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // We stop so any entry of length 8 can always be read. + for i := 0; i < len(d.dict)-8; i++ { + cv := load64(d.dict, i) + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + lTable[hashL] = uint32(i) | candidateL<<16 + sTable[hashS] = uint32(i) | candidateS<<16 + } + d.bestTableShort = &sTable + d.bestTableLong = &lTable + }) +} +*/ diff --git a/vendor/github.com/minio/minlz/encode.go b/vendor/github.com/minio/minlz/encode.go new file mode 100644 index 0000000000..d2e5a6060b --- /dev/null +++ b/vendor/github.com/minio/minlz/encode.go @@ -0,0 +1,277 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package minlz
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"hash/crc32"
+	"os"
+)
+
+const (
+	// LevelFastest is the fastest compression level.
+	LevelFastest = 1
+
+	// LevelBalanced is the balanced compression level.
+	// This is targeted to be approximately half the speed of LevelFastest.
+	LevelBalanced = 2
+
+	// LevelSmallest will attempt the best possible compression.
+	// There is no speed target for this level.
+	LevelSmallest = 3
+
+	// Internal use only.
+	copyLitBits = 2
+
+	maxCopy1Offset = 1024
+
+	minCopy2Offset = 64
+	maxCopy2Offset = minCopy2Offset + 65535
+	copy2LitMaxLen = 7 + 4 // max length
+	maxCopy2Lits   = 1 << copyLitBits
+	minCopy2Length = 64
+
+	maxCopy3Lits   = 1 << copyLitBits
+	minCopy3Offset = maxCopy2Offset + 1
+	maxCopy3Offset = minCopy3Offset + (1 << 21) - 1 // 2MiB
+
+	// debugValidateBlocks round-trips every encoded block when set.
+	debugValidateBlocks = false
+)
+
+// Encode returns the encoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire encoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The level must be LevelFastest, LevelBalanced or LevelSmallest;
+// anything else returns ErrInvalidLevel.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+func Encode(dst, src []byte, level int) ([]byte, error) {
+	if n := MaxEncodedLen(len(src)); n < 0 {
+		return nil, ErrTooLarge
+	} else if cap(dst) < n {
+		dst = make([]byte, n)
+	} else {
+		dst = dst[:n]
+	}
+	if len(src) < minNonLiteralBlockSize {
+		return encodeUncompressed(dst[:0], src), nil
+	}
+
+	dst[0] = 0
+	d := 1
+	// The block starts with the varint-encoded length of the decompressed bytes.
+	d += binary.PutUvarint(dst[d:], uint64(len(src)))
+
+	var n int
+	switch level {
+	case LevelFastest:
+		n = encodeBlock(dst[d:], src)
+	case LevelBalanced:
+		n = encodeBlockBetter(dst[d:], src)
+	case LevelSmallest:
+		n = encodeBlockBest(dst[d:], src, nil)
+	default:
+		return nil, ErrInvalidLevel
+	}
+	if n > 0 {
+		if debugValidateBlocks {
+			block := dst[d : d+n]
+			dst := make([]byte, len(src))
+			ret := minLZDecode(dst, block)
+			if !bytes.Equal(dst, src) {
+				n := matchLen(dst, src)
+				x := crc32.ChecksumIEEE(src)
+				name := fmt.Sprintf("errs/block-%08x", x)
+				fmt.Println(name, "mismatch at pos", n)
+				os.WriteFile(name+"input.bin", src, 0644)
+				os.WriteFile(name+"decoded.bin", dst, 0644)
+				os.WriteFile(name+"compressed.bin", block, 0644)
+			}
+			if ret != 0 {
+				panic("decode error")
+			}
+		}
+		d += n
+		return dst[:d], nil
+	}
+	// Not compressible, emit as uncompressed.
+	return encodeUncompressed(dst[:0], src), nil
+}
+
+// AppendEncoded will append the encoded version of src to dst.
+// If dst has MaxEncodedLen(len(src)) capacity left it will be done without allocation.
+// See Encode for more information.
+func AppendEncoded(dst, src []byte, level int) ([]byte, error) {
+	wantDst := MaxEncodedLen(len(src))
+	if wantDst < 0 {
+		return nil, ErrTooLarge
+	}
+	d := len(dst)
+	if cap(dst) < wantDst+d {
+		dst = append(dst, make([]byte, wantDst)...)
+	}
+	dst = dst[:d+wantDst]
+	res, err := Encode(dst[d:], src, level)
+	if err != nil {
+		return nil, err
+	}
+	if len(res) > wantDst {
+		panic("overflow")
+	}
+	return dst[:d+len(res)], nil
+}
+
+// TryEncode returns the encoded form of src, if compressible.
+// The same limitations apply as for the Encode function.
+// If the block is incompressible or another error occurs,
+// nil will be returned.
+func TryEncode(dst, src []byte, level int) []byte {
+	if n := MaxEncodedLen(len(src)); n < 0 {
+		return nil
+	} else if cap(dst) < n {
+		dst = make([]byte, n)
+	} else {
+		dst = dst[:n]
+	}
+
+	if len(src) < minNonLiteralBlockSize {
+		return nil
+	}
+
+	dst[0] = 0
+	d := 1
+	// The block starts with the varint-encoded length of the decompressed bytes.
+	d += binary.PutUvarint(dst[d:], uint64(len(src)))
+
+	var n int
+	switch level {
+	case LevelFastest:
+		n = encodeBlock(dst[d:], src)
+	case LevelBalanced:
+		n = encodeBlockBetter(dst[d:], src)
+	case LevelSmallest:
+		n = encodeBlockBest(dst[d:], src, nil)
+	default:
+		return nil
+	}
+
+	if n > 0 && d+n < len(src) {
+		d += n
+		return dst[:d]
+	}
+	// Not compressible
+	return nil
+}
+
+// inputMargin is the minimum number of extra input bytes to keep, inside
+// encodeBlock's inner loop.
On some architectures, this margin lets us
+// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
+// literals can be done as a single 16-byte load and store. That literal's
+// actual length can be as short as 1 byte, so this can copy up to 15 bytes
+// too much, but that's OK as subsequent iterations of the encoding loop will
+// fix up the copy overrun, and this inputMargin ensures that we don't overrun
+// the dst and src buffers.
+const inputMargin = 8
+
+// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
+// will be accepted by the encoder.
+const minNonLiteralBlockSize = 16
+
+// encodeUncompressed will append src to dst as uncompressed data and return it.
+func encodeUncompressed(dst, src []byte) []byte {
+	if len(src) == 0 {
+		return append(dst, 0)
+	}
+	return append(append(dst, 0, 0), src...)
+}
+
+// MaxEncodedLen returns the maximum length of a MinLZ block, given its
+// uncompressed length.
+//
+// It will return a negative value if srcLen is too large to encode.
+func MaxEncodedLen(srcLen int) int {
+	n := uint64(srcLen)
+	if n > MaxBlockSize {
+		return -1
+	}
+	if srcLen == 0 {
+		return 1
+	}
+	// Maximum overhead is 2 bytes.
+	return int(n + 2)
+}
+
+// encodeCopy2 encodes an offset/length copy and returns the number of bytes written.
+func encodeCopy2(dst []byte, offset, length int) int {
+	// The offset is stored relative to minCopy2Offset and the length with a
+	// 4-byte bias, making short copies cheaper.
+	length -= 4
+	offset -= minCopy2Offset
+	if debugEncode {
+		if length < 0 {
+			panic(fmt.Sprintf("invalid length %d", length))
+		}
+		if offset < 0 {
+			panic(fmt.Sprintf("invalid offset %d", offset))
+		}
+	}
+	store16(dst, 1, uint16(offset))
+	if length <= 60 {
+		store8(dst, 0, uint8(length)<<2|tagCopy2)
+		return 3
+	}
+	length -= 60
+	if length < 256 {
+		store8(dst, 3, uint8(length>>0))
+		store8(dst, 0, 61<<2|tagCopy2)
+		return 4
+	}
+
+	if length < 65536 {
+		dst[4] = uint8(length >> 8)
+		dst[3] = uint8(length >> 0)
+		dst[0] = 62<<2 | tagCopy2
+		return 5
+	}
+	dst[5] = uint8(length >> 16)
+	dst[4] = uint8(length >> 8)
+	dst[3] = uint8(length >> 0)
+	dst[0] = 63<<2 | tagCopy2
+	return 6
+}
+
+// emitLiteralSizeN returns the tag overhead of emitting n literal bytes.
+func emitLiteralSizeN(n int) int {
+	if n == 0 {
+		return 0
+	}
+	switch {
+	case n <= 29:
+		return 1
+	case n < 29+(1<<8):
+		return 2
+	case n < 29+(1<<16):
+		return 3
+	default:
+		return 4
+	}
+}
diff --git a/vendor/github.com/minio/minlz/encode_amd64.go b/vendor/github.com/minio/minlz/encode_amd64.go
new file mode 100644
index 0000000000..951e3d191e
--- /dev/null
+++ b/vendor/github.com/minio/minlz/encode_amd64.go
@@ -0,0 +1,189 @@
+// Copyright 2025 MinIO Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !appengine && !noasm && gc && !purego
+
+package minlz
+
+import (
+	"sync"
+
+	"github.com/minio/minlz/internal/race"
+)
+
+const hasAsm = true
+
+var encPools [7]sync.Pool
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst.
It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlock(dst, src []byte) (d int) { + race.ReadSlice(src) + race.WriteSlice(dst) + + switch { + case len(src) > 2<<20: + const sz, pool = 131072, 0 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm(dst, src, tmp) + case len(src) > 512<<10: + const sz, pool = 131072, 0 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm2MB(dst, src, tmp) + case len(src) > 64<<10: + const sz, pool = 65536, 2 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm512K(dst, src, tmp) + case len(src) > 16<<10: + const sz, pool = 16384, 3 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm64K(dst, src, tmp) + case len(src) > 4<<10: + const sz, pool = 8192, 4 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm16K(dst, src, tmp) + case len(src) > 1<<10: + const sz, pool = 2048, 5 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm4K(dst, src, tmp) + case len(src) > minNonLiteralBlockSize: + const sz, pool = 1024, 6 + tmp, ok := encPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encPools[pool].Put(tmp) + return encodeBlockAsm1K(dst, src, tmp) + } + return 0 +} + +var encBetterPools [6]sync.Pool + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
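+// Like encodeBlock above, it dispatches to a size-specialized assembly
+// variant and reuses pooled scratch tables via sync.Pool, so repeated
+// calls do not allocate.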
+// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetter(dst, src []byte) (d int) { + race.ReadSlice(src) + race.WriteSlice(dst) + + switch { + case len(src) > 2<<20: + const sz, pool = 589824, 0 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm(dst, src, tmp) + case len(src) > 512<<10: + const sz, pool = 589824, 0 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm2MB(dst, src, tmp) + case len(src) > 64<<10: + const sz, pool = 294912, 1 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm512K(dst, src, tmp) + case len(src) > 16<<10: + const sz, pool = 73728, 2 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm64K(dst, src, tmp) + case len(src) > 4<<10: + const sz, pool = 36864, 3 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm16K(dst, src, tmp) + case len(src) > 1<<10: + const sz, pool = 10240, 4 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm4K(dst, src, tmp) + case len(src) > minNonLiteralBlockSize: + const sz, pool = 4608, 5 + tmp, ok := encBetterPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encBetterPools[pool].Put(tmp) + return encodeBetterBlockAsm1K(dst, src, tmp) + } + return 0 +} diff --git a/vendor/github.com/minio/minlz/encode_l1.go b/vendor/github.com/minio/minlz/encode_l1.go new file mode 100644 index 0000000000..268b2e93b9 --- /dev/null +++ b/vendor/github.com/minio/minlz/encode_l1.go @@ -0,0 +1,526 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "bytes" + "encoding/hex" + "fmt" + "math/bits" +) + +// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash6(u uint64, h uint8) uint32 { + const prime6bytes = 227718039650203 + return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) +} + +// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
+// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 15 + maxTableSize = 1 << tableBits + skipLog = 6 + + debug = debugEncode + ) + if len(src) <= 65536 { + return encodeBlockGo64K(dst, src) + } + // Having values inside the table is ~the same speed as looking up + // - maybe slightly faster on bigger blocks. + // We go for the smaller stack allocation for now. + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + if debugEncode { + fmt.Println("encodeBlockGo: Starting encode") + } + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>skipLog + 4 + if nextS > sLimit { + goto emitRemainder + } + minSrcPos := s - maxCopy3Offset + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + // Speed impact is very small. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debugEncode { + fmt.Println(nextEmit, "(lits) length:", base-nextEmit, "d-after:", d) + } + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + d += emitRepeat(dst[d:], s-base) + if debugEncode { + fmt.Println(base, "(repeat) length:", s-base, "offset:", repeat, "d-after:", d) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if candidate >= minSrcPos && uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if candidate2 >= minSrcPos && uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if candidate >= minSrcPos && uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. 
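+		// A hash hit only proves a 4-byte match at s; the bytes just before
+		// s may match as well, and folding them into the copy is cheaper
+		// than emitting them as literals.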
+ for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes match. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + length := s - base + if nextEmit != base { + if base-nextEmit > maxCopy3Lits || repeat < minCopy2Offset { + // Bail if we exceed the maximum size. + // We will not exceed dstLimit with the other encodings. + if d+(s-nextEmit) > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopy(dst[d:], repeat, length) + } else if repeat <= maxCopy2Offset { + d += emitCopyLits2(dst[d:], src[nextEmit:base], repeat, length) + } else { + d += emitCopyLits3(dst[d:], src[nextEmit:base], repeat, length) + } + } else { + d += emitCopy(dst[d:], repeat, length) + } + if debugEncode { + fmt.Println(base, "(copy) length:", s-base, "offset:", repeat, "d-after:", d) + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic(fmt.Sprintf("mismatch: source: %v != target: %v", hex.EncodeToString(a), hex.EncodeToString(b))) + } + } + + for { + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + x := load64(src, s-2) + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + m2Hash := hash6(x, tableBits) + x = x >> 16 + currHash := hash6(x, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if debug && s == candidate { + panic("s == candidate") + } + if s-candidate > maxCopy3Offset || uint32(x) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + + repeat = s - candidate + base = s + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + d += emitCopy(dst[d:], repeat, s-base) + if debugEncode { + fmt.Println(base, "(copy) length:", s-base, "offset:", repeat, "d-after:", d) + } + } + } + +emitRemainder: + if nextEmit < len(src) { + if debugEncode { + fmt.Println(nextEmit, "emit remainder", len(src)-nextEmit, "d:", d) + } + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + if debugEncode { + fmt.Println("emit remainder", d+len(src)-nextEmit, " exceeds limit", dstLimit) + } + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +func encodeBlockGo64K(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 13 + maxTableSize = 1 << tableBits + skipLog = 5 + debug = debugEncode + ) + // Having values inside the table is ~the same speed as looking up + // - maybe slightly faster on bigger blocks. + // We go for the smaller stack allocation for now. + var table [maxTableSize]uint16 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. 
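+	// len(src)>>5 is ~3% of the input: encoding must save at least that
+	// many bytes, plus a 6-byte margin, or the block is abandoned and the
+	// input stored uncompressed.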
+ dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + if debugEncode { + fmt.Println("encodeBlockGo: Starting encode") + } + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>skipLog + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash5(cv, tableBits) + hash1 := hash5(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint16(s) + table[hash1] = uint16(s + 1) + hash2 := hash5(cv>>16, tableBits) + + // Check repeat at offset checkRep. + // Speed impact is very small. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debugEncode { + fmt.Println(nextEmit, "(lits) length:", base-nextEmit, "d-after:", d) + } + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + d += emitRepeat(dst[d:], s-base) + if debugEncode { + fmt.Println(base, "(repeat) length:", s-base, "offset:", repeat, "d-after:", d) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint16(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint16(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes match. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + length := s - base + if nextEmit != base { + if base-nextEmit > maxCopy2Lits || repeat < minCopy2Offset { + // Bail if we exceed the maximum size. + // We will not exceed dstLimit with the other encodings. 
+ if d+(s-nextEmit) > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopy(dst[d:], repeat, length) + } else { + d += emitCopyLits2(dst[d:], src[nextEmit:base], repeat, length) + } + } else { + d += emitCopy(dst[d:], repeat, length) + } + if debugEncode { + fmt.Println(base, "(copy) length:", s-base, "offset:", repeat, "d-after:", d) + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic(fmt.Sprintf("mismatch: source: %v != target: %v", hex.EncodeToString(a), hex.EncodeToString(b))) + } + } + + for { + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + x := load64(src, s-2) + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + m2Hash := hash5(x, tableBits) + x = x >> 16 + currHash := hash5(x, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint16(s - 2) + table[currHash] = uint16(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + + repeat = s - candidate + base = s + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + d += emitCopy(dst[d:], repeat, s-base) + if debugEncode { + fmt.Println(base, "(copy) length:", s-base, "offset:", repeat, "d-after:", d) + } + } + } + +emitRemainder: + if nextEmit < len(src) { + if debugEncode { + fmt.Println(nextEmit, "emit remainder", len(src)-nextEmit, "d:", d) + } + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + if debugEncode { + fmt.Println("emit remainder", d+len(src)-nextEmit, " exceeds limit", dstLimit) + } + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} diff --git a/vendor/github.com/minio/minlz/encode_l2.go b/vendor/github.com/minio/minlz/encode_l2.go new file mode 100644 index 0000000000..6c7bd66df3 --- /dev/null +++ b/vendor/github.com/minio/minlz/encode_l2.go @@ -0,0 +1,1230 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "fmt" + "math/bits" + "sync" +) + +// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4(u uint64, h uint8) uint32 { + const prime4bytes = 2654435761 + return (uint32(u) * prime4bytes) >> ((32 - h) & 31) +} + +// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. 
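+//
+// All hashN helpers use the same multiplicative scheme: shift u so only the
+// bytes of interest contribute, multiply by a large odd constant and keep
+// the top h bits of the product.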
+func hash5(u uint64, h uint8) uint32 { + const prime5bytes = 889523592379 + return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63)) +} + +// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash7(u uint64, h uint8) uint32 { + const prime7bytes = 58295818150454627 + return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63)) +} + +// hash8 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <64. +func hash8(u uint64, h uint8) uint32 { + const prime8bytes = 0xcf1bbcdcb7a56463 + return uint32((u * prime8bytes) >> ((64 - h) & 63)) +} + +var encLPool sync.Pool + +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterGo(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + if len(src) <= 64<<10 { + return encodeBlockBetterGo64K(dst, src) + } + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 17 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + ) + + var lTable *[maxLTableSize]uint32 + if t := encLPool.Get(); t != nil { + lTable = t.(*[maxLTableSize]uint32) + *lTable = [maxLTableSize]uint32{} + } else { + lTable = new([maxLTableSize]uint32) + } + defer encLPool.Put(lTable) + var sTable [maxSTableSize]uint32 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 1 + + if debugEncode { + fmt.Println("encodeBlockBetterGo: Starting encode") + } + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>7 + 1 + if nextS > sLimit { + goto emitRemainder + } + minSrcPos := s - maxCopy3Offset + 1 + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if candidateL > minSrcPos && cv == valLong { + break + } + + // Check repeat at offset checkRep. + const checkRep = 1 + // Minimum length of a repeat. Tested with various values. + const wantRepeatBytes = 4 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. 
+ if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. + if candidateL >= minSrcPos && uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if candidateS >= minSrcPos && uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if candidateL > minSrcPos && uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + // Bail if the match is equal or worse to the encoding. + if offset > 65535 && s-base <= 4 && repeat != offset { + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + lits := src[nextEmit:base] + if len(lits) > 0 { + if offset <= maxCopy2Offset { + // 2 byte offsets. + // In rare cases, literal + copy1 will be smaller, but + // this is faster to decode and it is rare, so we accept that. + if len(lits) > maxCopy2Lits || offset < 64 { + d += emitLiteral(dst[d:], lits) + d += emitCopy(dst[d:], offset, s-base) + } else { + d += emitCopyLits2(dst[d:], lits, offset, s-base) + } + } else { + // 3 byte offset + if len(lits) > maxCopy3Lits { + d += emitLiteral(dst[d:], lits) + d += emitCopy(dst[d:], offset, s-base) + } else { + d += emitCopyLits3(dst[d:], lits, offset, s-base) + } + } + } else { + d += emitCopy(dst[d:], offset, s-base) + } + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. 
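+			// A zero return marks the block as incompressible; the callers
+			// (Encode/TryEncode) then fall back to storing the input as
+			// uncompressed literals.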
+ return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + // lTable could be postponed, but very minor difference. + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +var encLPool64K sync.Pool + +// encodeBlockBetterGo64K is a specialized version that handles inputs <= 64KB +func encodeBlockBetterGo64K(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 15 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 12 + maxSTableSize = 1 << sTableBits + ) + + var lTable *[maxLTableSize]uint16 + if t := encLPool64K.Get(); t != nil { + lTable = t.(*[maxLTableSize]uint16) + *lTable = [maxLTableSize]uint16{} + } else { + lTable = new([maxLTableSize]uint16) + } + defer encLPool64K.Put(lTable) + var sTable [maxSTableSize]uint16 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 1 + + if debugEncode { + fmt.Println("encodeBlockBetterGo64K: Starting encode") + } + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>7 + 1 + if nextS > sLimit { + goto emitRemainder + } + hashL := hash6(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint16(s) + sTable[hashS] = uint16(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + + // Check repeat at offset checkRep. + const checkRep = 1 + // Minimum length of a repeat. Tested with various values. + const wantRepeatBytes = 4 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. 
+ if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash6(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + lTable[hash6(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash6(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint16(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + lits := src[nextEmit:base] + if len(lits) > 0 { + // 2 byte offsets. + // In rare cases, literal + copy1 will be smaller, but + // this is faster to decode and it is rare, so we accept that. + if len(lits) > maxCopy2Lits || offset < 64 { + d += emitLiteral(dst[d:], lits) + d += emitCopy(dst[d:], offset, s-base) + } else { + d += emitCopyLits2(dst[d:], lits, offset, s-base) + } + } else { + d += emitCopy(dst[d:], offset, s-base) + } + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash6(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + // lTable could be postponed, but very minor difference. + lTable[hash6(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. 
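+		// index0 walks from the start of the span and index2 from its
+		// midpoint, both stepping by 2, so long-table entries are spread
+		// across the whole skipped region.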
+ index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash6(load64(src, index0), lTableBits)] = uint16(index0) + lTable[hash6(load64(src, index2), lTableBits)] = uint16(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +/* +// encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterDict(dst, src []byte, dict *dict) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + // Initialize the hash tables. + const ( + // Long hash matches. + lTableBits = 17 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 14 + maxSTableSize = 1 << sTableBits + + maxAhead = 8 // maximum bytes ahead without checking sLimit + + debug = false + ) + + sLimit := len(src) - inputMargin + if sLimit > maxDictSrcOffset-maxAhead { + sLimit = maxDictSrcOffset - maxAhead + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + + dict.initBetter() + + var lTable [maxLTableSize]uint32 + var sTable [maxSTableSize]uint32 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 0 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := len(dict.dict) - dict.repeat + + // While in dict +searchDict: + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>7 + 1 + if nextS > sLimit { + break searchDict + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + dictL := int(dict.betterTableLong[hashL]) + dictS := int(dict.betterTableShort[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if s != 0 { + if cv == valLong { + goto emitMatch + } + if cv == valShort { + candidateL = candidateS + goto emitMatch + } + } + + // Check dict repeat. + if repeat >= s+4 { + candidate := len(dict.dict) - repeat + s + if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) { + // Extend back + base := s + for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. 
+ if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != base { + fmt.Println("emitted ", base-nextEmit, "literals") + } + s += 4 + candidate += 4 + for candidate < len(dict.dict)-8 && s <= len(src)-8 { + if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + d += emitRepeat(dst[d:], s-base) + if debug { + fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s) + } + nextEmit = s + if s >= sLimit { + break searchDict + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + cv = load64(src, s) + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } + continue + } + } + // Don't try to find match at s==0 + if s == 0 { + cv = load64(src, nextS) + s = nextS + continue + } + + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { + goto emitMatch + } + + // Long dict... + if uint32(cv) == load32(dict.dict, dictL) { + candidateL = dictL + goto emitDict + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + goto emitMatch + } + // Use our short candidate. + candidateL = candidateS + goto emitMatch + } + if uint32(cv) == load32(dict.dict, dictS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + goto emitMatch + } + candidateL = dictS + goto emitDict + } + cv = load64(src, nextS) + s = nextS + } + emitDict: + { + if debug { + if load32(dict.dict, candidateL) != load32(src, s) { + panic("dict emit mismatch") + } + } + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + offset := s + (len(dict.dict)) - candidateL + + // Extend the 4-byte match as long as possible. 
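+			// Here candidateL indexes into dict.dict, so the extension
+			// loop below compares src against dictionary bytes rather
+			// than earlier src bytes.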
+ s += 4 + candidateL += 4 + for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 { + if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if repeat == offset { + if debug { + fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL) + } + d += emitRepeat(dst[d:], s-base) + } else { + if debug { + fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL) + } + // Matches longer than 64 are split. + if s <= sLimit || s-base < 8 { + d += emitCopy(dst[d:], offset, s-base) + } else { + // Split to ensure we don't start a copy within next block. + d += emitCopy(dst[d:], offset, 4) + d += emitRepeat(dst[d:], s-base-4) + } + repeat = offset + } + if false { + // Validate match. + if s <= candidateL { + panic("s <= candidate") + } + a := src[base:s] + b := dict.dict[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + break searchDict + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // index every second long in between. + for index0 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) + index0 += 2 + index1 -= 2 + } + } + continue + } + emitMatch: + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debug && nextEmit != s { + fmt.Println("emitted ", s-nextEmit, "literals") + } + if repeat == offset { + if debug { + fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s) + } + d += emitRepeat(dst[d:], s-base) + } else { + if debug { + fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s) + } + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. 
+ return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + + // Search without dict: + if repeat > s { + repeat = 0 + } + + // No more dict + sLimit = len(src) - inputMargin + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + if debug { + fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s) + } + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>7 + 1 + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint32(s) + sTable[hashS] = uint32(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + if cv == valShort { + candidateL = candidateS + break + } + + // Check repeat at offset checkRep. + const checkRep = 1 + // Minimum length of a repeat. Tested with various values. + const wantRepeatBytes = 4 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if cv&repeatMask == load64(src, s-repeat)&repeatMask { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. 
+ if uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint32(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + if offset > 65535 && s-base <= 5 && repeat != offset { + // Bail if the match is equal or worse to the encoding. + s = nextS + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if repeat == offset { + d += emitRepeat(dst[d:], s-base) + } else { + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} +*/ diff --git a/vendor/github.com/minio/minlz/encode_l3.go b/vendor/github.com/minio/minlz/encode_l3.go new file mode 100644 index 0000000000..436606ea29 --- /dev/null +++ b/vendor/github.com/minio/minlz/encode_l3.go @@ -0,0 +1,700 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "fmt" + "math" + "math/bits" + "sync" +) + +// pools with hash tables for best encoding. 
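+// The long table is 1<<20 uint64 entries (8 MiB) and the short table
+// 1<<18 entries (2 MiB), so they are pooled rather than allocated per
+// call; reused tables are still zeroed before use.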
+var encBestLPool sync.Pool +var encBestSPool sync.Pool + +// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBest(dst, src []byte, dict *dict) (d int) { + // Initialize the hash tables. + // TODO: dict + const ( + // Long hash matches. + lTableBits = 20 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 18 + maxSTableSize = 1 << sTableBits + + inputMargin = 8 + 2 + + debug = debugEncode + ) + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + sLimitDict := len(src) - inputMargin + if sLimitDict > maxDictSrcOffset-inputMargin { + sLimitDict = maxDictSrcOffset - inputMargin + } + + var lTable *[maxLTableSize]uint64 + if t := encBestLPool.Get(); t != nil { + lTable = t.(*[maxLTableSize]uint64) + *lTable = [maxLTableSize]uint64{} + } else { + lTable = new([maxLTableSize]uint64) + } + defer encBestLPool.Put(lTable) + + var sTable *[maxSTableSize]uint64 + if t := encBestSPool.Get(); t != nil { + sTable = t.(*[maxSTableSize]uint64) + *sTable = [maxSTableSize]uint64{} + } else { + sTable = new([maxSTableSize]uint64) + } + defer encBestSPool.Put(sTable) + + //var lTable [maxLTableSize]uint64 + //var sTable [maxSTableSize]uint64 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + repeat := 1 + if dict != nil { + //dict.initBest() + s = 0 + repeat = len(dict.dict) - dict.repeat + } + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + const lowbitMask = 0xffffffff + getCur := func(x uint64) int { + return int(x & lowbitMask) + } + getPrev := func(x uint64) int { + return int(x >> 32) + } + const maxSkip = 64 + + if debugEncode { + fmt.Println("encodeBlockBest: Starting encode") + } + for { + type match struct { + offset int + s int + length int + score int + rep, dict bool + nextrep bool + } + var best match + for { + // Next src position to check + nextS := (s-nextEmit)>>8 + 1 + if nextS > maxSkip { + nextS = s + maxSkip + } else { + nextS += s + } + if nextS > sLimit { + goto emitRemainder + } + if dict != nil && s >= maxDictSrcOffset { + dict = nil + if repeat > s { + repeat = math.MinInt32 + } + } + hashL := hash8(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL := lTable[hashL] + candidateS := sTable[hashS] + + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. + ll := m.s - nextEmit + // Bigger score is better. + // -m.s indicates the base cost. + score := m.length - emitLiteralSizeN(ll) - m.s + offset := m.s - m.offset + if m.rep { + return score - emitRepeatSize(m.length) + } + + if ll > 0 && offset > 1024 { + // Check for fused discount + if ll <= maxCopy2Lits && offset < 65536+63 && m.length <= copy2LitMaxLen { + // 1-4 Literals can be embedded in copy2 without cost. 
+ score++ + } else if ll <= maxCopy3Lits { + // 0-3 Literals can be embedded in copy3 without cost. + score++ + } + } + return score - emitCopySize(offset, m.length) + } + + matchAt := func(offset, s int, first uint32) match { + if (best.length != 0 && best.s-best.offset == s-offset) || s-offset >= maxCopy3Offset || s <= offset { + // Don't retest if we have the same offset. + return match{offset: offset, s: s} + } + if debug && s == offset { + panic(offset) + } + if load32(src, offset) != first { + return match{offset: offset, s: s} + } + + m := match{offset: offset, s: s, length: 4 + offset, rep: false} + s += 4 + + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + // Extend back... + for m.s > nextEmit && m.offset > 0 { + if src[m.offset-1] != src[m.s-1] { + break + } + m.s-- + m.offset-- + m.length++ + } + m.length -= offset + + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. + m.length = 0 + } + if m.s+m.length < sLimit { + const checkoff = 1 + a, b := m.s+m.length+checkoff, m.offset+m.length+checkoff + m.nextrep = load32(src, a) == load32(src, b) + } + return m + } + matchAtRepeat := func(offset, s int, first uint32) match { + if best.rep { + // Don't retest if we already have a repeat + return match{offset: offset, s: s} + } + // 2 gives close to no improvement, + // since it may just give 'literal -> len 2 repeat -> literal' section. + // which eats up the gains in overhead. + // 3 gives pretty consistent improvement + const checkbytes = 3 + mask := uint32((1 << (8 * checkbytes)) - 1) + if load32(src, offset)&mask != first&mask { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: checkbytes + offset, rep: true} + s += checkbytes + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + // Extend back... + for m.s > nextEmit && m.offset > 0 { + if src[m.offset-1] != src[m.s-1] { + break + } + m.s-- + m.offset-- + m.length++ + } + m.length -= offset + if m.s+m.length < sLimit { + const checkoff = 1 + a, b := m.s+m.length+checkoff, m.offset+m.length+checkoff + m.nextrep = load32(src, a) == load32(src, b) + } + m.score = score(m) + if debug && m.length > 0 && m.length < 3 { + fmt.Println("repeat", m.length, "offset", m.offset, "s", m.s, "score", m.score, "first", first, "mask", mask, "src", src[m.offset:m.offset+m.length], "src", src[m.s:m.s+m.length]) + } + return m + } + matchDict := func(candidate, s int, first uint32, rep bool) match { + if s >= maxDictSrcOffset { + return match{offset: candidate, s: s} + } + // Calculate offset as if in continuous array with s + offset := -len(dict.dict) + candidate + if best.length != 0 && best.s-best.offset == s-offset && !rep { + // Don't retest if we have the same offset. 
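+				// A match with length 0 is treated as "no candidate" by bestOf.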
+ return match{offset: offset, s: s} + } + + if load32(dict.dict, candidate) != first { + return match{offset: offset, s: s} + } + m := match{offset: offset, s: s, length: 4 + candidate, rep: rep, dict: true} + s += 4 + if !rep { + for s < sLimitDict && m.length < len(dict.dict) { + if len(src)-s < 8 || len(dict.dict)-m.length < 8 { + if src[s] == dict.dict[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + } else { + for s < len(src) && m.length < len(dict.dict) { + if len(src)-s < 8 || len(dict.dict)-m.length < 8 { + if src[s] == dict.dict[m.length] { + m.length++ + s++ + continue + } + break + } + if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 { + m.length += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + m.length += 8 + } + } + m.length -= candidate + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. + m.length = 0 + } + return m + } + + bestOf := func(a, b match) match { + if b.length == 0 { + return a + } + if a.length == 0 { + return b + } + if a.score > b.score { + return a + } + if b.score > a.score { + return b + } + + // Pick whichever starts the earliest, + // we can probably find a match right away + if a.s != b.s { + if a.s < b.s { + return a + } + return b + } + // If one is a good repeat candidate, pick it. + if a.nextrep != b.nextrep { + if a.nextrep { + return a + } + return b + } + // Pick the smallest distance offset. + if a.offset > b.offset { + return a + } + return b + } + + if s > 0 { + best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv))) + best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv))) + best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv))) + } + if dict != nil { + candidateL := dict.bestTableLong[hashL] + candidateS := dict.bestTableShort[hashS] + best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateL>>16), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS>>16), s, uint32(cv), false)) + } + { + if dict == nil || repeat <= s { + best = bestOf(best, matchAtRepeat(s-repeat, s, uint32(cv))) + best = bestOf(best, matchAtRepeat(s-repeat+1, s+1, uint32(cv>>8))) + } else if s-repeat < -4 && dict != nil { + candidate := len(dict.dict) - (repeat - s) + best = bestOf(best, matchDict(candidate, s, uint32(cv), true)) + candidate++ + best = bestOf(best, matchDict(candidate, s+1, uint32(cv>>8), true)) + } + + if best.length > 0 { + hashS := hash4(cv>>8, sTableBits) + // s+1 + nextShort := sTable[hashS] + sFwd := s + 1 + cv := load64(src, sFwd) + hashL := hash8(cv, lTableBits) + nextLong := lTable[hashL] + best = bestOf(best, matchAt(getCur(nextShort), sFwd, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), sFwd, uint32(cv))) + best = bestOf(best, matchAt(getCur(nextLong), sFwd, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), sFwd, uint32(cv))) + + // dict at + 1 + if dict != nil { + candidateL := dict.bestTableLong[hashL] + candidateS := dict.bestTableShort[hashS] + + best = bestOf(best, matchDict(int(candidateL&0xffff), sFwd, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS&0xffff), sFwd, uint32(cv), false)) + } + + // s+2 + if true { + sFwd++ + cv = 
load64(src, sFwd) + hashL := hash8(cv, lTableBits) + nextLong = lTable[hashL] + + if dict == nil || repeat <= sFwd { + // Repeat at + 2 + best = bestOf(best, matchAtRepeat(sFwd-repeat, sFwd, uint32(cv))) + } else if repeat-sFwd > 4 && dict != nil { + candidate := len(dict.dict) - (repeat - sFwd) + best = bestOf(best, matchDict(candidate, sFwd, uint32(cv), true)) + } + if true { + hashS := hash4(cv, sTableBits) + nextShort = sTable[hashS] + best = bestOf(best, matchAt(getCur(nextShort), sFwd, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextShort), sFwd, uint32(cv))) + } + best = bestOf(best, matchAt(getCur(nextLong), sFwd, uint32(cv))) + best = bestOf(best, matchAt(getPrev(nextLong), sFwd, uint32(cv))) + + // dict at +2 + // Very small gain + if dict != nil { + candidateL := dict.bestTableLong[hashL] + candidateS := dict.bestTableShort[hashS] + + best = bestOf(best, matchDict(int(candidateL&0xffff), sFwd, uint32(cv), false)) + best = bestOf(best, matchDict(int(candidateS&0xffff), sFwd, uint32(cv), false)) + } + } + + // Search for a match at best match end, see if that is better. + // Allow some bytes at the beginning to mismatch. + // Sweet spot is around 1-2 bytes, but depends on input. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 + const skipEnd = 1 + if sAt := best.s + best.length - skipEnd; sAt < sLimit { + + sBack := best.s + skipBeginning - skipEnd + backL := best.length - skipBeginning + // Load initial values + cv = load64(src, sBack) + + // Grab candidates... + next := lTable[hash8(load64(src, sAt), lTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + // Quite small gain, but generally a benefit on very compressible material. + if true { + next = sTable[hash4(load64(src, sAt), sTableBits)] + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv))) + } + } + } + } + } + + // Update table + lTable[hashL] = uint64(s) | candidateL<<32 + sTable[hashS] = uint64(s) | candidateS<<32 + + if best.length > 0 { + break + } + + cv = load64(src, nextS) + s = nextS + } + startIdx := s + 1 + s = best.s + + if debug && best.offset >= s { + panic(fmt.Errorf("t %d >= s %d", best.offset, s)) + } + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := s - best.offset + s += best.length + // Bail if the match is equal or worse to the encoding. + if !best.rep && best.length <= 4 { + if offset > 65535 || + // Output will almost always be the same, and decoding will be slightly slower. + // We might find a better match before end of these 4 bytes. + (offset > maxCopy1Offset && offset <= maxCopy2Offset && base-nextEmit > maxCopy2Lits) { + s = startIdx + 1 + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + } + if debug && nextEmit != base { + fmt.Println("EMIT", base-nextEmit, "literals. 
base-after:", base) + } + + if best.rep { + if debug { + fmt.Println("REPEAT, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best) + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + // same as `d := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], best.length) + } else { + lits := src[nextEmit:base] + if debug { + fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best, "lits:", len(lits)) + } + if len(lits) > 0 { + if offset <= maxCopy2Offset { + // 1-2 byte offsets + if len(lits) > maxCopy2Lits || offset < 64 || (offset <= 1024 && best.length > copy2LitMaxLen) { + d += emitLiteral(dst[d:], lits) + if best.length > 18 && best.length <= 64 && offset >= 64 { + // Size is equal. + // Prefer Copy2, since it decodes faster + d += encodeCopy2(dst[d:], offset, best.length) + } else { + d += emitCopy(dst[d:], offset, best.length) + } + } else { + if best.length > 11 { + // We are emitting remaining as a separate repeat. + // We might as well do a search for a better match. + d += emitCopyLits2(dst[d:], lits, offset, 11) + s = best.s + 11 + } else { + d += emitCopyLits2(dst[d:], lits, offset, best.length) + } + } + } else { + // 3 byte offset + if len(lits) > maxCopy3Lits { + d += emitLiteral(dst[d:], lits) + d += emitCopy(dst[d:], offset, best.length) + } else { + d += emitCopyLits3(dst[d:], lits, offset, best.length) + } + } + } else { + if best.length > 18 && best.length <= 64 && offset >= 64 && offset <= maxCopy2Offset { + // Size is equal. + // Prefer Copy2, since it decodes faster + d += encodeCopy2(dst[d:], offset, best.length) + } else { + d += emitCopy(dst[d:], offset, best.length) + } + } + } + repeat = offset + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Fill tables... + for i := startIdx; i < s; i++ { + cv0 := load64(src, i) + long0 := hash8(cv0, lTableBits) + short0 := hash4(cv0, sTableBits) + lTable[long0] = uint64(i) | lTable[long0]<<32 + sTable[short0] = uint64(i) | sTable[short0]<<32 + } + cv = load64(src, s) + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + litLen := len(src) - nextEmit + if d+litLen+emitLiteralSizeN(litLen) > dstLimit { + if debug && nextEmit != s { + fmt.Println("emitting would exceed dstLimit. Not compressing") + } + return 0 + } + if debug && nextEmit != s { + fmt.Println("emitted ", len(src)-nextEmit, "literals") + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// emitCopySize returns the size to encode the offset+length +// +// It assumes that: +// +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopySize(offset, length int) int { + if offset > 65536+63 { + // 3 Byte offset + Variable length (base length 4). + length -= 64 // Base is free. We can add 64 for free. + if length <= 0 { + return 4 + } + return 4 + (bits.Len(uint(length))+7)/8 + } + + // Offset no more than 2 bytes. + if offset <= 1024 { + if length <= 18 { + // Emit up to 18 bytes with short offset. + return 2 + } + if length < 18+256 { + return 3 + } + // Worst case we have to emit a repeat for the rest + return 2 + emitRepeatSize(length-18) + } + // 2 byte offset + Variable length (base length 4). + return emitCopy2Size(length) +} + +// emitRepeatSize returns the number of bytes required to encode a repeat. 
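+// One byte encodes lengths up to 29; two bytes up to 29+256; three
+// bytes up to 29+65536; anything longer takes four bytes.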
+// Length must be at least 1 and < 1<<24 +func emitRepeatSize(length int) int { + if length <= 0 { + return 0 + } + + if length <= 29 { + return 1 + } + length -= 29 + if length <= 256 { + return 2 + } + if length <= 65536 { + return 3 + } + return 4 +} + +// emitCopy2Size returns the number of bytes required to encode a copy2. +// Length must be less than 1<<24 +func emitCopy2Size(length int) int { + length -= 4 + + if length <= 60 { + // Length inside tag. + return 3 + } + length -= 60 + if length < 256 { + // Length in 1 byte. + return 4 + } + if length < 65536 { + // Length in 2 bytes. + return 5 + } + // Length in 3 bytes. + return 6 +} diff --git a/vendor/github.com/minio/minlz/index.go b/vendor/github.com/minio/minlz/index.go new file mode 100644 index 0000000000..846cfb7e40 --- /dev/null +++ b/vendor/github.com/minio/minlz/index.go @@ -0,0 +1,636 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "bytes" + "encoding/binary" + "encoding/json" + "fmt" + "io" + "sort" +) + +const ( + IndexHeader = "s2idx\x00" + IndexTrailer = "\x00xdi2s" + maxIndexEntries = 1 << 16 + minIndexDist = 1 << 20 // Minimum uncompressed distance between entries +) + +// Index represents an S2/Snappy/MinLZ index. +type Index struct { + // Total Uncompressed size. + TotalUncompressed int64 + + // Total Compressed size if known. Will be -1 if unknown. + TotalCompressed int64 + + // Offset pairs are pairs of Compressed -> Uncompressed positions. + // Offsets are stream offsets from first stream byte. + // It will be safe to start decompressing from any of these offsets. + // The slice is sorted by offset. + Offsets []OffsetPair + + estBlockUncomp int64 +} + +type OffsetPair struct { + CompressedOffset int64 + UncompressedOffset int64 +} + +func (i *Index) reset(maxBlock int) { + if i == nil { + return + } + for maxBlock < minIndexDist { + maxBlock *= 2 + } + i.estBlockUncomp = int64(maxBlock) + i.TotalCompressed = -1 + i.TotalUncompressed = -1 + if len(i.Offsets) > 0 { + i.Offsets = i.Offsets[:0] + } +} + +// allocInfos will allocate an empty slice of infos. +func (i *Index) allocInfos(n int) { + if n > maxIndexEntries { + panic("n > maxIndexEntries") + } + i.Offsets = make([]OffsetPair, 0, n) +} + +// add an uncompressed and compressed pair. +// Entries must be sent in order. 
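+// Entries closer than estBlockUncomp (uncompressed) to the previous
+// entry are silently dropped, and the table is thinned via reduceLight
+// once it grows past maxIndexEntries.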
+func (i *Index) add(compressedOffset, uncompressedOffset int64) error { + if i == nil { + return nil + } + lastIdx := len(i.Offsets) - 1 + if lastIdx >= 0 { + latest := i.Offsets[lastIdx] + if uncompressedOffset-latest.UncompressedOffset < i.estBlockUncomp { + // Don't add until we have i.estBlockUncomp + return nil + } + if latest.UncompressedOffset > uncompressedOffset { + return fmt.Errorf("internal error: Earlier uncompressed received (%d > %d)", latest.UncompressedOffset, uncompressedOffset) + } + if latest.CompressedOffset > compressedOffset { + return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.UncompressedOffset, uncompressedOffset) + } + } + i.Offsets = append(i.Offsets, OffsetPair{CompressedOffset: compressedOffset, UncompressedOffset: uncompressedOffset}) + if len(i.Offsets) > maxIndexEntries { + // Keep memory from exploding. + i.reduceLight() + } + return nil +} + +// Find the offset at or before the wanted (uncompressed) offset. +// If offset is 0 or positive it is the offset from the beginning of the file. +// If the uncompressed size is known, the offset must be within the file. +// If an offset outside the file is requested io.ErrUnexpectedEOF is returned. +// If the offset is negative, it is interpreted as the distance from the end of the file, +// where -1 represents the last byte. +// If offset from the end of the file is requested, but size is unknown, +// ErrUnsupported will be returned. +func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err error) { + if i.TotalUncompressed < 0 { + return 0, 0, ErrCorrupt + } + if offset < 0 { + offset = i.TotalUncompressed + offset + if offset < 0 { + return 0, 0, io.ErrUnexpectedEOF + } + } + if offset > i.TotalUncompressed { + return 0, 0, io.ErrUnexpectedEOF + } + if len(i.Offsets) > 200 { + n := sort.Search(len(i.Offsets), func(n int) bool { + return i.Offsets[n].UncompressedOffset > offset + }) + if n == 0 { + n = 1 + } + return i.Offsets[n-1].CompressedOffset, i.Offsets[n-1].UncompressedOffset, nil + } + for _, info := range i.Offsets { + if info.UncompressedOffset > offset { + break + } + compressedOff = info.CompressedOffset + uncompressedOff = info.UncompressedOffset + } + return compressedOff, uncompressedOff, nil +} + +// reduce to stay below maxIndexEntries +func (i *Index) reduce() { + if len(i.Offsets) < maxIndexEntries { + return + } + + // Algorithm, keep 1, remove removeN entries... + removeN := (len(i.Offsets) + 1) / maxIndexEntries + src := i.Offsets + j := 0 + + // Each block should be at least 1MB, but don't reduce below 1000 entries. + for i.estBlockUncomp*(int64(removeN)+1) < minIndexDist && len(i.Offsets)/(removeN+1) > 1000 { + removeN++ + } + for idx := 0; idx < len(src); idx++ { + i.Offsets[j] = src[idx] + j++ + idx += removeN + } + i.Offsets = i.Offsets[:j] + // Update maxblock estimate. + i.estBlockUncomp += i.estBlockUncomp * int64(removeN) +} + +// reduce to stay below maxIndexEntries +func (i *Index) reduceLight() { + i.estBlockUncomp *= 2 + src := i.Offsets + var j int + for idx := 0; idx < len(src); idx++ { + base := src[idx] + i.Offsets[j] = base + j++ + for idx < len(src) && src[idx].UncompressedOffset-base.UncompressedOffset < i.estBlockUncomp { + idx++ + } + } + i.Offsets = i.Offsets[:j] +} + +func (i *Index) appendTo(b []byte, uncompTotal, compTotal int64) []byte { + if i == nil { + return nil + } + i.reduce() + var tmp [binary.MaxVarintLen64]byte + + initSize := len(b) + // We make the start a skippable header+size. 
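+	// The three zero bytes are a placeholder for the 24-bit
+	// little-endian chunk length, which is patched in below once the
+	// final size is known.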
+ b = append(b, chunkTypeIndex, 0, 0, 0) + b = append(b, []byte(IndexHeader)...) + // Total Uncompressed size + n := binary.PutVarint(tmp[:], uncompTotal) + b = append(b, tmp[:n]...) + // Total Compressed size + n = binary.PutVarint(tmp[:], compTotal) + b = append(b, tmp[:n]...) + // Put EstBlockUncomp size + n = binary.PutVarint(tmp[:], i.estBlockUncomp) + b = append(b, tmp[:n]...) + // Put length + n = binary.PutVarint(tmp[:], int64(len(i.Offsets))) + b = append(b, tmp[:n]...) + + // Check if we should add uncompressed offsets + var hasUncompressed byte + for idx, info := range i.Offsets { + if idx == 0 { + if info.UncompressedOffset != 0 { + hasUncompressed = 1 + break + } + continue + } + if info.UncompressedOffset != i.Offsets[idx-1].UncompressedOffset+i.estBlockUncomp { + hasUncompressed = 1 + break + } + } + b = append(b, hasUncompressed) + + // Add each entry + if hasUncompressed == 1 { + for idx, info := range i.Offsets { + uOff := info.UncompressedOffset + if idx > 0 { + prev := i.Offsets[idx-1] + uOff -= prev.UncompressedOffset + (i.estBlockUncomp) + } + n = binary.PutVarint(tmp[:], uOff) + b = append(b, tmp[:n]...) + } + } + + // Initial compressed size estimate. + cPredict := i.estBlockUncomp / 2 + + for idx, info := range i.Offsets { + cOff := info.CompressedOffset + if idx > 0 { + prev := i.Offsets[idx-1] + cOff -= prev.CompressedOffset + cPredict + // Update compressed size prediction, with half the error. + cPredict += cOff / 2 + } + b = binary.AppendVarint(b, cOff) + } + + // Add Total Size. + // Stored as fixed size for easier reading. + binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)-initSize+4+len(IndexTrailer))) + b = append(b, tmp[:4]...) + // Trailer + b = append(b, []byte(IndexTrailer)...) + + // Update size + chunkLen := len(b) - initSize - skippableFrameHeader + b[initSize+1] = uint8(chunkLen >> 0) + b[initSize+2] = uint8(chunkLen >> 8) + b[initSize+3] = uint8(chunkLen >> 16) + //fmt.Printf("chunklen: 0x%x Uncomp:%d, Comp:%d\n", chunkLen, uncompTotal, compTotal) + return b +} + +// Load a binary index. +// A zero value Index can be used or a previous one can be reused. +func (i *Index) Load(b []byte) ([]byte, error) { + if len(b) <= 4+len(IndexHeader)+len(IndexTrailer) { + return b, io.ErrUnexpectedEOF + } + if b[0] != chunkTypeIndex && b[0] != legacyIndexChunk { + return b, ErrCorrupt + } + chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16 + b = b[4:] + + // Validate we have enough... 
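+	// chunkLen covers everything from the index header through the
+	// trailer, so the remaining input must be at least that long.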
+ if len(b) < chunkLen { + return b, io.ErrUnexpectedEOF + } + if !bytes.Equal(b[:len(IndexHeader)], []byte(IndexHeader)) { + return b, ErrUnsupported + } + b = b[len(IndexHeader):] + + // Total Uncompressed + if v, n := binary.Varint(b); n <= 0 || v < 0 { + return b, ErrCorrupt + } else { + i.TotalUncompressed = v + b = b[n:] + } + + // Total Compressed + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + i.TotalCompressed = v + b = b[n:] + } + + // Read EstBlockUncomp + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + if v < 0 { + return b, ErrCorrupt + } + i.estBlockUncomp = v + b = b[n:] + } + + var entries int + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + if v < 0 || v > maxIndexEntries { + return b, ErrCorrupt + } + entries = int(v) + b = b[n:] + } + if cap(i.Offsets) < entries { + i.allocInfos(entries) + } + i.Offsets = i.Offsets[:entries] + + if len(b) < 1 { + return b, io.ErrUnexpectedEOF + } + hasUncompressed := b[0] + b = b[1:] + if hasUncompressed&1 != hasUncompressed { + return b, ErrCorrupt + } + + // Add each uncompressed entry + for idx := range i.Offsets { + var uOff int64 + if hasUncompressed != 0 { + // Load delta + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + uOff = v + b = b[n:] + } + } + + if idx > 0 { + prev := i.Offsets[idx-1].UncompressedOffset + uOff += prev + (i.estBlockUncomp) + if uOff <= prev { + return b, ErrCorrupt + } + } + if uOff < 0 { + return b, ErrCorrupt + } + i.Offsets[idx].UncompressedOffset = uOff + } + + // Initial compressed size estimate. + cPredict := i.estBlockUncomp / 2 + + // Add each compressed entry + for idx := range i.Offsets { + var cOff int64 + if v, n := binary.Varint(b); n <= 0 { + return b, ErrCorrupt + } else { + cOff = v + b = b[n:] + } + + if idx > 0 { + // Update compressed size prediction, with half the error. + cPredictNew := cPredict + cOff/2 + + prev := i.Offsets[idx-1].CompressedOffset + cOff += prev + cPredict + if cOff <= prev { + return b, ErrCorrupt + } + cPredict = cPredictNew + } + if cOff < 0 { + return b, ErrCorrupt + } + i.Offsets[idx].CompressedOffset = cOff + } + if len(b) < 4+len(IndexTrailer) { + return b, io.ErrUnexpectedEOF + } + // Skip size... + b = b[4:] + + // Check trailer... + if !bytes.Equal(b[:len(IndexTrailer)], []byte(IndexTrailer)) { + return b, ErrCorrupt + } + return b[len(IndexTrailer):], nil +} + +// LoadStream will load an index from the end of the supplied stream. +// ErrUnsupported will be returned if the signature cannot be found. +// ErrCorrupt will be returned if unexpected values are found. +// io.ErrUnexpectedEOF is returned if there are too few bytes. +// IO errors are returned as-is. +func (i *Index) LoadStream(rs io.ReadSeeker) error { + // Go to end. + _, err := rs.Seek(-10, io.SeekEnd) + if err != nil { + return err + } + var tmp [10]byte + _, err = io.ReadFull(rs, tmp[:]) + if err != nil { + return err + } + // Check trailer... + if !bytes.Equal(tmp[4:4+len(IndexTrailer)], []byte(IndexTrailer)) { + return ErrUnsupported + } + sz := binary.LittleEndian.Uint32(tmp[:4]) + if sz > MaxUserChunkSize+skippableFrameHeader { + return ErrCorrupt + } + _, err = rs.Seek(-int64(sz), io.SeekEnd) + if err != nil { + return err + } + + // Read index. + buf := make([]byte, sz) + _, err = io.ReadFull(rs, buf) + if err != nil { + return err + } + _, err = i.Load(buf) + return err +} + +// IndexStream will return an index for a stream. 
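+// Blocks are sized from their headers (via DecodedLen for compressed
+// chunks) without being decompressed.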
+// The stream structure will be checked, but +// data within blocks is not verified. +// The returned index can either be appended to the end of the stream +// or stored separately. +func IndexStream(r io.Reader) ([]byte, error) { + var i Index + var buf [MaxUserChunkSize]byte + var readHeader bool + for { + _, err := io.ReadFull(r, buf[:4]) + if err != nil { + if err == io.EOF { + return i.appendTo(nil, i.TotalUncompressed, i.TotalCompressed), nil + } + return nil, err + } + // Start of this chunk. + startChunk := i.TotalCompressed + i.TotalCompressed += 4 + + chunkType := buf[0] + if !readHeader { + if chunkType != ChunkTypeStreamIdentifier && chunkType != chunkTypeEOF { + return nil, ErrCorrupt + } + readHeader = true + } + chunkLen := int(buf[1]) | int(buf[2])<<8 | int(buf[3])<<16 + if chunkLen < checksumSize { + return nil, ErrCorrupt + } + + i.TotalCompressed += int64(chunkLen) + _, err = io.ReadFull(r, buf[:chunkLen]) + if err != nil { + return nil, io.ErrUnexpectedEOF + } + + switch chunkType { + case chunkTypeLegacyCompressedData, chunkTypeMinLZCompressedData, chunkTypeMinLZCompressedDataCompCRC: + // Section 4.2. Compressed data (chunk type 0x00). + // Skip checksum. + dLen, err := DecodedLen(buf[checksumSize:]) + if err != nil { + return nil, err + } + if dLen > maxBlockSize { + return nil, ErrCorrupt + } + if i.estBlockUncomp == 0 { + // Use first block for estimate... + i.estBlockUncomp = int64(dLen) + } + err = i.add(startChunk, i.TotalUncompressed) + if err != nil { + return nil, err + } + i.TotalUncompressed += int64(dLen) + continue + case chunkTypeUncompressedData: + n2 := chunkLen - checksumSize + if n2 > maxBlockSize { + return nil, ErrCorrupt + } + if i.estBlockUncomp == 0 { + // Use first block for estimate... + i.estBlockUncomp = int64(n2) + } + err = i.add(startChunk, i.TotalUncompressed) + if err != nil { + return nil, err + } + i.TotalUncompressed += int64(n2) + continue + case ChunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != magicBodyLen { + return nil, ErrCorrupt + } + + if string(buf[:len(magicBody)]) != magicBody { + if string(buf[:len(magicBodyS2)]) != magicBodyS2 { + if string(buf[:magicBodyLen]) != magicBodySnappy { + return nil, ErrCorrupt + } + } + } + continue + case chunkTypeEOF: + continue + } + + if chunkType <= maxNonSkippableChunk { + // Section 4.5. Reserved unskippable chunks (chunk types 0x03-0x3f). + fmt.Println("UN:", chunkType) + return nil, ErrUnsupported + } + // Skip user chunks and padding. + } +} + +// JSON returns the index as JSON text. +func (i *Index) JSON() []byte { + type offset struct { + CompressedOffset int64 `json:"compressed"` + UncompressedOffset int64 `json:"uncompressed"` + } + x := struct { + TotalUncompressed int64 `json:"total_uncompressed"` // Total Uncompressed size if known. Will be -1 if unknown. + TotalCompressed int64 `json:"total_compressed"` // Total Compressed size if known. Will be -1 if unknown. + Offsets []offset `json:"offsets"` + EstBlockUncomp int64 `json:"est_block_uncompressed"` + }{ + TotalUncompressed: i.TotalUncompressed, + TotalCompressed: i.TotalCompressed, + EstBlockUncomp: i.estBlockUncomp, + } + for _, v := range i.Offsets { + x.Offsets = append(x.Offsets, offset{CompressedOffset: v.CompressedOffset, UncompressedOffset: v.UncompressedOffset}) + } + b, _ := json.MarshalIndent(x, "", " ") + return b +} + +// RemoveIndexHeaders will trim all headers and trailers from a given index. +// This is expected to save 20 bytes. 
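+// (4-byte chunk header + 6-byte IndexHeader + 6-byte IndexTrailer +
+// 4-byte size field.)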
+// These can be restored using RestoreIndexHeaders. +// This removes a layer of security, but is the most compact representation. +// Returns nil if headers contains errors. +// The returned slice references the provided slice. +func RemoveIndexHeaders(b []byte) []byte { + const save = 4 + len(IndexHeader) + len(IndexTrailer) + 4 + if len(b) <= save { + return nil + } + if b[0] != chunkTypeIndex { + return nil + } + chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16 + b = b[4:] + + // Validate we have enough... + if len(b) < chunkLen { + return nil + } + b = b[:chunkLen] + + if !bytes.Equal(b[:len(IndexHeader)], []byte(IndexHeader)) { + return nil + } + b = b[len(IndexHeader):] + if !bytes.HasSuffix(b, []byte(IndexTrailer)) { + return nil + } + b = bytes.TrimSuffix(b, []byte(IndexTrailer)) + + if len(b) < 4 { + return nil + } + return b[:len(b)-4] +} + +// RestoreIndexHeaders will index restore headers removed by RemoveIndexHeaders. +// No error checking is performed on the input. +// If a 0 length slice is sent, it is returned without modification. +func RestoreIndexHeaders(in []byte) []byte { + if len(in) == 0 { + return in + } + b := make([]byte, 0, 4+len(IndexHeader)+len(in)+len(IndexTrailer)+4) + b = append(b, chunkTypeIndex, 0, 0, 0) + b = append(b, []byte(IndexHeader)...) + b = append(b, in...) + + // Size of block as uint32 + b = binary.LittleEndian.AppendUint32(b, uint32(len(b)+4+len(IndexTrailer))) + + // Trailer + b = append(b, []byte(IndexTrailer)...) + + chunkLen := len(b) - skippableFrameHeader + b[1] = uint8(chunkLen >> 0) + b[2] = uint8(chunkLen >> 8) + b[3] = uint8(chunkLen >> 16) + return b +} diff --git a/vendor/github.com/minio/minlz/internal/race/norace.go b/vendor/github.com/minio/minlz/internal/race/norace.go new file mode 100644 index 0000000000..67d0699e3c --- /dev/null +++ b/vendor/github.com/minio/minlz/internal/race/norace.go @@ -0,0 +1,25 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !race + +package race + +const Enabled = false + +func ReadSlice[T any](s []T) { +} + +func WriteSlice[T any](s []T) { +} diff --git a/vendor/github.com/minio/minlz/internal/race/race.go b/vendor/github.com/minio/minlz/internal/race/race.go new file mode 100644 index 0000000000..96282873f4 --- /dev/null +++ b/vendor/github.com/minio/minlz/internal/race/race.go @@ -0,0 +1,38 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build race + +package race + +import ( + "runtime" + "unsafe" +) + +const Enabled = true + +func ReadSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} + +func WriteSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceWriteRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} diff --git a/vendor/github.com/minio/minlz/lz4convert.go b/vendor/github.com/minio/minlz/lz4convert.go new file mode 100644 index 0000000000..05460c657c --- /dev/null +++ b/vendor/github.com/minio/minlz/lz4convert.go @@ -0,0 +1,454 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "encoding/binary" + "errors" + "fmt" + "io" +) + +// lZ4Converter provides conversion from LZ4 blocks as defined here: +// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md +type lZ4Converter struct { +} + +// errDstTooSmall is returned when provided destination is too small. +var errDstTooSmall = errors.New("minlz: destination too small") + +// errIncompressible is returned when the block is incompressible. +var errIncompressible = errors.New("minlz: incompressible") + +// ConvertBlock will convert an LZ4 block and append it as an MinLZ +// block without a block length to dst. +// The uncompressed size is returned as well. +// dst must have capacity to contain the entire compressed block, +// which may exceed MaxEncodedLen(). +func (l *lZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const lz4MinMatch = 4 + const inlineLits = true + + // The block starts with the varint-encoded length of the decompressed bytes. 
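+	// s tracks the read position in the LZ4 source; d is the write
+	// position in dst (appending after its current length).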
+ s, d := 0, len(dst) + dst = dst[:cap(dst)] + if !debug && hasAsm { + res, sz := cvtLZ4BlockAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmallRet = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmallRet: + return nil, 0, errDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if res < sz { + return nil, 0, errIncompressible + } + if d+sz > len(dst) { + return nil, 0, errDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + dStart := d + + var lastOffset uint16 + lastOffset = 1 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return dst[:d], 0, ErrCorrupt + } + if uncompressed > MaxBlockSize { + return dst[:d], 0, ErrTooLarge + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return dst[:d], 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + var lits []byte + if ll > 0 { + if d+ll > dLimit { + if debug { + fmt.Printf("ERR: emit %d literals, d:%d, dLimit: %d\n", ll, d, dLimit) + } + return nil, 0, errDstTooSmall + } + if debug { + fmt.Printf("emit %d literals, pos:%d\n", ll, uncompressed) + } + lits = src[s : s+ll] + s += ll + } + + // Check if we are done... + if s == len(src) && ml == lz4MinMatch { + if uncompressed+ll > MaxBlockSize { + return dst[:d], 0, ErrTooLarge + } + uncompressed += ll + d += emitLiteral(dst[d:], lits) + break + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := load16(src, s) + isRepeat := offset == lastOffset + if len(lits) > 0 { + // There are no offset >64K, so copy3 doesn't apply. + if !inlineLits || len(lits) > maxCopy2Lits || + (offset <= 1024 && ml > copy2LitMaxLen) || // Comment out for speed. 
offset < 64 ||
+ isRepeat {
+ d += emitLiteral(dst[d:], lits)
+ lits = nil
+ }
+ uncompressed += ll
+ }
+
+ s += 2
+ if offset == 0 {
+ if debug {
+ fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
+ }
+ return nil, 0, ErrCorrupt
+ }
+ if int(offset) > uncompressed {
+ if debug {
+ fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
+ }
+ return nil, 0, ErrCorrupt
+ }
+
+ if ml == lz4MinMatch+15 {
+ for {
+ if s >= len(src) {
+ if debug {
+ fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
+ }
+ return nil, 0, ErrCorrupt
+ }
+ val := src[s]
+ s++
+ ml += int(val)
+ if val != 255 {
+ if s >= len(src) {
+ if debug {
+ fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
+ }
+ return nil, 0, ErrCorrupt
+ }
+ break
+ }
+ }
+ }
+ if isRepeat {
+ if debug {
+ fmt.Printf("emit repeat, length: %d, offset: %d, pos:%d\n", ml, offset, uncompressed)
+ }
+ d += emitRepeat(dst[d:], ml)
+ } else {
+ if len(lits) > 0 {
+ if debug {
+ fmt.Printf("emit %d lits + copy, length: %d, offset: %d, pos:%d\n", len(lits), ml, offset, uncompressed)
+ }
+ d += emitCopyLits2(dst[d:], lits, int(offset), ml)
+ } else {
+ if debug {
+ fmt.Printf("emit copy, length: %d, offset: %d, pos:%d\n", ml, offset, uncompressed)
+ }
+ d += emitCopy(dst[d:], int(offset), ml)
+ }
+ lastOffset = offset
+ }
+ uncompressed += ml
+ if d > dLimit {
+ return nil, 0, errDstTooSmall
+ }
+ }
+ if uncompressed < d-dStart {
+ return nil, 0, errIncompressible
+ }
+ return dst[:d], uncompressed, nil
+}
+
+func (l *lZ4Converter) ConvertStream(w io.Writer, r io.Reader) error {
+ var tmp [4]byte
+ const debug = false
+ for {
+ // Read magic
+ _, err := io.ReadFull(r, tmp[:4])
+ if err != nil {
+ if err == io.EOF {
+ return nil
+ }
+ return err
+ }
+ if binary.LittleEndian.Uint32(tmp[:4]) != 0x184D2204 {
+ return fmt.Errorf("minlz: invalid lz4 magic: %x", tmp[:4])
+ }
+
+ // Read Frame Descriptor
+ _, err = io.ReadFull(r, tmp[:2])
+ if err != nil {
+ if err == io.EOF {
+ return io.ErrUnexpectedEOF
+ }
+ return err
+ }
+ if tmp[0]&(1<<3) != 0 {
+ // Content Size - ignore
+ var tmp2 [8]byte
+ _, err = io.ReadFull(r, tmp2[:8])
+ if err != nil {
+ if err == io.EOF {
+ return io.ErrUnexpectedEOF
+ }
+ return err
+ }
+ }
+ if tmp[0]&(1<<0) != 0 {
+ // DictID - fail if set
+ var tmp2 [4]byte
+ _, err = io.ReadFull(r, tmp2[:4])
+ if err != nil {
+ if err == io.EOF {
+ return io.ErrUnexpectedEOF
+ }
+ return err
+ }
+ if tmp2 != [4]byte{0, 0, 0, 0} {
+ return fmt.Errorf("minlz: dictID not supported")
+ }
+ }
+ // Version
+ if tmp[0]>>6 != 1 {
+ return fmt.Errorf("minlz: unknown version: %d %d", tmp[0]>>6, tmp[0])
+ }
+ // Block Independence
+ if tmp[0]&(1<<5) == 0 {
+ return fmt.Errorf("minlz: block dependence not supported")
+ }
+ blockCrc := tmp[0]&(1<<4) != 0
+ contentCrc := tmp[0]&(1<<2) != 0
+ maxBlockSz := 0
+ // Block Maximum Size
+ bz := int(tmp[1] >> 4 & 0x7)
+ switch bz {
+ case 4:
+ maxBlockSz = 64 << 10
+ case 5:
+ maxBlockSz = 256 << 10
+ case 6:
+ maxBlockSz = 1 << 20
+ case 7:
+ maxBlockSz = 4 << 20
+ default:
+ return fmt.Errorf("minlz: invalid block size: %d", bz)
+ }
+ // Header Checksum
+ _, err = io.ReadFull(r, tmp[:1])
+ if err != nil {
+ if err == io.EOF {
+ return io.ErrUnexpectedEOF
+ }
+ return err
+ }
+ var n int
+ n, err = w.Write(makeHeader(maxBlockSz))
+ if err != nil {
+ return err
+ }
+ if n != len(magicChunk)+1 {
+ return io.ErrShortWrite
+ }
+
+ block := make([]byte, maxBlockSz)
+ dst := make([]byte, MaxEncodedLen(maxBlockSz))
+ uncompSize := 0
+ if debug {
+ fmt.Println("hasCrc:", blockCrc)
+ }
+ for {
+ // Read block size
+ _, err := io.ReadFull(r, tmp[:4])
+ if err != nil {
+ if err == io.EOF {
+ break
+ }
+ return err
+ }
+ compressed := true
+ blockSize := int(binary.LittleEndian.Uint32(tmp[:4]))
+ if blockSize == 0 {
+ if blockCrc {
+ _, err = io.ReadFull(r, tmp[:4])
+ if err != nil {
+ if err == io.EOF {
+ return io.ErrUnexpectedEOF
+ }
+ return err
+ }
+ }
+ break
+ }
+ if blockSize>>31 != 0 {
+ compressed = false
+ blockSize &= (1 << 31) - 1
+ }
+ if blockSize > maxBlockSize {
+ return fmt.Errorf("minlz: block size too large: %d", blockSize)
+ }
+ _, err = io.ReadFull(r, block[:blockSize])
+ if err != nil {
+ return err
+ }
+ // Read checksum (ignored)
+ if blockCrc {
+ _, err = io.ReadFull(r, tmp[:4])
+ if err != nil {
+ if err == io.EOF {
+ return io.ErrUnexpectedEOF
+ }
+ return err
+ }
+ }
+ if !compressed {
+ var obuf [8]byte
+ uncompressed := block[:blockSize]
+ // Set to uncompressed.
+ chunkType := uint8(chunkTypeUncompressedData)
+ chunkLen := 4 + len(uncompressed)
+
+ // Write as uncompressed.
+ checksum := crc(uncompressed)
+ obuf[0] = chunkType
+ obuf[1] = uint8(chunkLen >> 0)
+ obuf[2] = uint8(chunkLen >> 8)
+ obuf[3] = uint8(chunkLen >> 16)
+ obuf[4] = uint8(checksum >> 0)
+ obuf[5] = uint8(checksum >> 8)
+ obuf[6] = uint8(checksum >> 16)
+ obuf[7] = uint8(checksum >> 24)
+ _, err = w.Write(obuf[:8])
+ if err != nil {
+ return err
+ }
+ _, err = w.Write(uncompressed)
+ if err != nil {
+ return err
+ }
+ uncompSize += len(uncompressed)
+ continue
+ }
+ // Convert block
+ out, sz, err := l.ConvertBlock(dst[:0], block[:blockSize])
+ if err != nil {
+ return err
+ }
+ out = out[3:]
+ if debug {
+ fmt.Println(blockSize, "=>", len(out), "uncompressed:", sz, "ratio:", 100*float64(len(out))/float64(blockSize))
+ }
+ var obuf [8]byte
+ chunkType := uint8(chunkTypeMinLZCompressedDataCompCRC)
+ chunkLen := 4 + len(out)
+
+ // Write block.
+ checksum := crc(out)
+ obuf[0] = chunkType
+ obuf[1] = uint8(chunkLen >> 0)
+ obuf[2] = uint8(chunkLen >> 8)
+ obuf[3] = uint8(chunkLen >> 16)
+ obuf[4] = uint8(checksum >> 0)
+ obuf[5] = uint8(checksum >> 8)
+ obuf[6] = uint8(checksum >> 16)
+ obuf[7] = uint8(checksum >> 24)
+ _, err = w.Write(obuf[:8])
+ if err != nil {
+ return err
+ }
+ _, err = w.Write(out)
+ uncompSize += sz
+ }
+ if contentCrc {
+ // Read content crc (ignored)
+ _, err = io.ReadFull(r, tmp[:4])
+ if err != nil {
+ if err == io.EOF {
+ return io.ErrUnexpectedEOF
+ }
+ return err
+ }
+ }
+ var tmp [4 + binary.MaxVarintLen64]byte
+ tmp[0] = chunkTypeEOF
+ // Write uncompressed size.
+ n = binary.PutUvarint(tmp[4:], uint64(uncompSize))
+ tmp[1] = uint8(n)
+ n += 4
+ _, err = w.Write(tmp[:n])
+ if err != nil {
+ return err
+ }
+ }
+}
diff --git a/vendor/github.com/minio/minlz/minlz.go b/vendor/github.com/minio/minlz/minlz.go
new file mode 100644
index 0000000000..2620d6a0f2
--- /dev/null
+++ b/vendor/github.com/minio/minlz/minlz.go
@@ -0,0 +1,142 @@
+// Copyright 2025 MinIO Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "bytes" + "hash/crc32" +) + +const ( + // MaxBlockSize is the maximum value where MaxEncodedLen will return a valid block size. + MaxBlockSize = 8 << 20 + + // MinUserSkippableChunk is the lowest user defined skippable chunk ID. + // All chunks IDs within this range will be ignored if not handled. + MinUserSkippableChunk = 0x80 + + // MaxUserSkippableChunk is the last user defined skippable chunk ID. + MaxUserSkippableChunk = 0xbf + + // MinUserNonSkippableChunk is the lowest user defined non-skippable chunk ID. + // All chunks IDs within this range will cause an error if not handled. + MinUserNonSkippableChunk = 0xc0 + + // MaxUserNonSkippableChunk is the last user defined non-skippable chunk ID. + MaxUserNonSkippableChunk = 0xfd + + // ChunkTypePadding is a padding chunk. + ChunkTypePadding = 0xfe + + // ChunkTypeStreamIdentifier is the Snappy/S2/MinLZ stream id chunk. + ChunkTypeStreamIdentifier = 0xff + + // MaxUserChunkSize is the maximum possible size of a single chunk. + MaxUserChunkSize = 1<<24 - 1 // 16777215 +) + +// debugging constants that will enable debug printing and extra checks. +const ( + debugValidateBlocks = false + + // Enable debug output for encoding. + debugEncode = false + + // Enable debug output for Go decoding. + debugDecode = false +) + +/* +Each encoded block begins with the varint-encoded length of the decoded data, +followed by a sequence of chunks. Chunks begin and end on byte boundaries. The +first byte of each chunk is broken into its 2 least and 6 most significant bits +called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. +Zero means a literal tag. All other values mean a copy tag. + +For literal tags: + - If m < 60, the next 1 + m bytes are literal bytes. + - Otherwise, let n be the little-endian unsigned integer denoted by the next + m - 59 bytes. The next 1 + n bytes after that are literal bytes. +*/ +const ( + tagLiteral = 0x00 + tagRepeat = 0x00 | (1 << 2) + tagCopy1 = 0x01 + tagCopy2 = 0x02 + tagCopy3 = 0x03 | 4 + tagCopy2Fused = 0x03 +) + +const ( + checksumSize = 4 + chunkHeaderSize = 4 + magicChunk = "\xff\x06\x00\x00" + magicBody + magicBodySnappy = "sNaPpY" + magicBodyS2 = "S2sTwO" + magicBody = "MinLz" + magicBodyLen = len(magicBody) + 1 + magicChunkS2 = "\xff\x06\x00\x00" + magicBodyS2 + magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy + maxBlockLog = 23 + + // maxBlockSize is the maximum size of the input to encodeBlock. + // + // For the framing format (Writer type instead of Encode function), + // this is the maximum uncompressed size of a block. + maxBlockSize = 1 << maxBlockLog + + // minBlockSize is the minimum size of block setting when creating a writer. + minBlockSize = 4 << 10 + + skippableFrameHeader = 4 + + // Default block size + defaultBlockSize = 2 << 20 + + // maxSnappyBlockSize is the maximum snappy block size in streams. + maxSnappyBlockSize = 1 << 16 + + // maxS2BlockSize is the maximum s2 block size in streams. 
+ maxS2BlockSize = 4 << 20 + + obufHeaderLen = checksumSize + chunkHeaderSize +) + +// Internal chunk ids +const ( + chunkTypeLegacyCompressedData = 0x00 + chunkTypeUncompressedData = 0x01 + chunkTypeMinLZCompressedData = 0x02 + chunkTypeMinLZCompressedDataCompCRC = 0x03 + chunkTypeEOF = 0x20 + maxNonSkippableChunk = 0x3f + chunkTypeIndex = 0x40 // chunk id of MinLZ index + legacyIndexChunk = 0x99 // S2 index chunk id (now in user-skippable range) +) + +var crcTable = crc32.MakeTable(crc32.Castagnoli) + +// crc implements the checksum specified in section 3 of +// https://github.com/google/snappy/blob/master/framing_format.txt +func crc(b []byte) uint32 { + c := crc32.Update(0, crcTable, b) + return c>>15 | c<<17 + 0xa282ead8 +} + +type byter interface { + Bytes() []byte +} + +var _ byter = &bytes.Buffer{} diff --git a/vendor/github.com/minio/minlz/reader.go b/vendor/github.com/minio/minlz/reader.go new file mode 100644 index 0000000000..c895b2de2d --- /dev/null +++ b/vendor/github.com/minio/minlz/reader.go @@ -0,0 +1,1526 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "runtime" + "sync" + + "github.com/klauspost/compress/s2" +) + +// ErrCantSeek is returned if the stream cannot be seeked. +type ErrCantSeek struct { + Reason string +} + +// Error returns the error as string. +func (e ErrCantSeek) Error() string { + return fmt.Sprintf("minlz: Can't seek because %s", e.Reason) +} + +// NewReader returns a new Reader that decompresses from r, using the framing +// format described at +// https://github.com/google/snappy/blob/master/framing_format.txt with S2 changes. +func NewReader(r io.Reader, opts ...ReaderOption) *Reader { + nr := Reader{ + r: r, + maxBlock: maxBlockSize, + allowFallback: false, + } + for _, opt := range opts { + if err := opt(&nr); err != nil { + nr.err = err + return &nr + } + } + nr.maxBufSize = MaxEncodedLen(nr.maxBlock) + checksumSize + nr.maxBlockOrg = nr.maxBlock + nr.readHeader = nr.ignoreStreamID + nr.paramsOK = true + return &nr +} + +// ReaderOption is an option for creating a decoder. +type ReaderOption func(*Reader) error + +// ReaderMaxBlockSize allows controlling allocations if the stream +// has been compressed with a smaller WriterBlockSize, or with the default 1MB. +// Blocks must be this size or smaller to decompress, +// otherwise the decoder will return ErrUnsupported. +// +// For streams compressed with Snappy this can safely be set to 64KB (64 << 10). +// +// Default is the maximum limit of 8MB. +func ReaderMaxBlockSize(blockSize int) ReaderOption { + return func(r *Reader) error { + if blockSize > maxBlockSize || blockSize <= minBlockSize { + return errors.New("minlz: invalid block size. 
Must be <= 8MB and >= 4KB")
+ }
+ r.maxBlock = blockSize
+ return nil
+ }
+}
+
+// ReaderIgnoreStreamIdentifier will make the reader skip the expected
+// stream identifier at the beginning of the stream.
+// This can be used when serving a stream that has been forwarded to a specific point.
+// Validation of EOF length is also disabled.
+func ReaderIgnoreStreamIdentifier() ReaderOption {
+ return func(r *Reader) error {
+ r.ignoreStreamID = true
+ return nil
+ }
+}
+
+// ReaderUserChunkCB will register a callback for chunks with the specified ID.
+// ID must be a user chunk ID, 0x80-0xfd (inclusive).
+// For each chunk with the ID, the callback is called with the content.
+// Any returned non-nil error will abort decompression.
+// Only one callback per ID is supported; the latest registered will be used.
+// Sending a nil function will disable previous callbacks.
+// You can peek the stream, triggering the callback, by doing a Read with a 0
+// byte buffer.
+func ReaderUserChunkCB(id uint8, fn func(r io.Reader) error) ReaderOption {
+ return func(r *Reader) error {
+ if id < MinUserSkippableChunk || id > MaxUserNonSkippableChunk {
+ return fmt.Errorf("ReaderUserChunkCB: Invalid id provided, must be 0x80-0xfd (inclusive)")
+ }
+ r.skippableCB[id-MinUserSkippableChunk] = fn
+ return nil
+ }
+}
+
+// ReaderIgnoreCRC will make the reader skip CRC calculation and checks.
+func ReaderIgnoreCRC() ReaderOption {
+ return func(r *Reader) error {
+ r.ignoreCRC = true
+ return nil
+ }
+}
+
+// ReaderFallback will enable/disable S2/Snappy fallback.
+func ReaderFallback(b bool) ReaderOption {
+ return func(r *Reader) error {
+ r.allowFallback = b
+ return nil
+ }
+}
+
+// Reader is an io.Reader that can read MinLZ-compressed bytes.
+type Reader struct {
+ r io.Reader
+ err error
+ decoded []byte
+ buf []byte
+ tmp [16]byte
+ skippableCB [MaxUserNonSkippableChunk - MinUserSkippableChunk + 1]func(r io.Reader) error
+ blockStart int64 // Uncompressed offset at start of current.
+ index *Index
+
+ // decoded[i:j] contains decoded bytes that have not yet been passed on.
+ i, j int
+ // maximum block size allowed.
+ maxBlock int
+ maxBlockOrg int
+ // maximum expected buffer size.
+ maxBufSize int
+ readHeader bool
+ paramsOK bool
+ snappyFrame bool
+ ignoreStreamID bool
+ ignoreCRC bool
+ allowFallback bool
+ wantEOF bool
+}
+
+// GetBufferCapacity returns the capacity of the internal buffer.
+// This might be useful to know when reusing the same reader in combination
+// with the lazy buffer option.
+func (r *Reader) GetBufferCapacity() int {
+ return cap(r.buf)
+}
+
+// ensureBufferSize will ensure that the buffer can take at least n bytes.
+// If false is returned the buffer exceeds maximum allowed size.
+func (r *Reader) ensureBufferSize(n int) bool {
+ if n > r.maxBufSize {
+ r.err = ErrCorrupt
+ return false
+ }
+ if cap(r.buf) >= n {
+ return true
+ }
+ // Realloc buffer.
+ r.buf = make([]byte, n)
+ return true
+}
+
+// Reset discards any buffered data, resets all state, and switches the
+// reader to read from the provided reader. This permits reusing a Reader
+// rather than allocating a new one.
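+//
+// A minimal reuse sketch (streams a and b are illustrative):
+//
+//	r := minlz.NewReader(a)
+//	_, _ = io.Copy(io.Discard, r)
+//	r.Reset(b) // decode stream b, reusing r's buffers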
+func (r *Reader) Reset(reader io.Reader) { + if !r.paramsOK { + return + } + r.index = nil + r.r = reader + r.err = nil + r.i = 0 + r.j = 0 + r.blockStart = 0 + r.readHeader = r.ignoreStreamID + r.wantEOF = false + r.snappyFrame = false + r.maxBlock = r.maxBlockOrg + r.maxBufSize = MaxEncodedLen(r.maxBlock) + checksumSize +} + +func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) { + if _, r.err = io.ReadFull(r.r, p); r.err != nil { + if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + } + return false + } + return true +} + +// skippable will skip n bytes. +// tmp is used as a temporary buffer for reading. +// The supplied slice does not need to be the size of the read. +func (r *Reader) skippable(tmp []byte, n int, allowEOF bool, id uint8) (ok bool) { + if len(tmp) < 4096 { + tmp = make([]byte, 4096) + } + if id <= maxNonSkippableChunk { + r.err = fmt.Errorf("internal error: skippable id >= 0x40") + return false + } + if id >= MinUserSkippableChunk && id <= MaxUserNonSkippableChunk { + if fn := r.skippableCB[id-MinUserSkippableChunk]; fn != nil { + rd := io.LimitReader(r.r, int64(n)) + r.err = fn(rd) + if r.err != nil { + return false + } + _, r.err = io.CopyBuffer(io.Discard, rd, tmp) + return r.err == nil + } else if id >= MinUserNonSkippableChunk && id <= MaxUserNonSkippableChunk { + r.err = errors.New("un-skippable user chunk found") + return false + } + } + // Read and discard. + for n > 0 { + if n < len(tmp) { + tmp = tmp[:n] + } + if _, r.err = io.ReadFull(r.r, tmp); r.err != nil { + if errors.Is(r.err, io.ErrUnexpectedEOF) || (r.err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + } + return false + } + n -= len(tmp) + } + return true +} + +// Read satisfies the io.Reader interface. +func (r *Reader) Read(p []byte) (int, error) { + if r.err != nil { + return 0, r.err + } + const debug = false + for { + if r.i < r.j { + n := copy(p, r.decoded[r.i:r.j]) + r.i += n + return n, nil + } + if !r.readFull(r.tmp[:4], !r.wantEOF) { + if debug { + if r.err != io.EOF { + fmt.Println("Readfull failed", r.err) + } + } + return 0, r.err + } + chunkType := r.tmp[0] + chunkLen := int(r.tmp[1]) | int(r.tmp[2])<<8 | int(r.tmp[3])<<16 + if debug { + fmt.Printf("chunkType: 0x%x, chunkLen: %d\n", chunkType, chunkLen) + } + + if !r.readHeader { + if chunkType == ChunkTypeStreamIdentifier { + r.readHeader = true + } else if chunkType <= maxNonSkippableChunk && chunkType != chunkTypeEOF { + if debug { + fmt.Println("ERR: Header not found, got chunk", chunkType) + } + r.err = ErrCorrupt + return 0, r.err + } + } + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeMinLZCompressedData, chunkTypeMinLZCompressedDataCompCRC: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). 
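+ //
+ // Every chunk read above uses the same framing, so the data parsed in
+ // this case is laid out as (the 3-byte length is little-endian and
+ // covers everything after the 4-byte header):
+ //
+ //	[type:1] [length:3] [crc:4] [compressed block]
+ //
+ // For chunkTypeMinLZCompressedDataCompCRC the checksum covers the
+ // compressed payload; for chunkTypeMinLZCompressedData it covers the
+ // decoded bytes (see the toCRC selection below).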
+ if chunkLen < checksumSize { + if debug { + fmt.Println("ERR: Read chunk too short, want checksum", chunkLen) + } + r.err = ErrCorrupt + return 0, r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrTooLarge + } + return 0, r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, hdrLen, err := decodedLen(buf) + if err != nil { + if debug { + fmt.Println("ERR: decodedLen:", err) + } + r.err = err + return 0, r.err + } + + if n > r.maxBlock { + r.err = ErrTooLarge + return 0, r.err + } + if n > len(r.decoded) { + r.decoded = make([]byte, n) + } + buf = buf[hdrLen:] + if n == 0 || n < len(buf) { + if debug { + fmt.Println("ERR: Invalid decompressed length:", n, "buf length:", len(buf)) + } + r.err = ErrCorrupt + return 0, r.err + } + if ret := minLZDecode(r.decoded[:n], buf); ret != 0 { + if debug { + fmt.Println("ERR: Decoder returned error code:", ret) + } + r.err = ErrCorrupt + return 0, r.err + } + toCRC := r.decoded[:n] + if chunkType == chunkTypeMinLZCompressedDataCompCRC { + toCRC = buf + } + if !r.ignoreCRC && crc(toCRC) != checksum { + if debug { + fmt.Println("ERR: CRC mismatch") + } + r.err = ErrCRC + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeLegacyCompressedData: + if !r.allowFallback { + if debug { + fmt.Println("ERR: Legacy compressed data not allowed") + } + r.err = ErrUnsupported + return 0, r.err + } + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrTooLarge + } + return 0, r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize || n > maxS2BlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > r.maxBlock { + r.err = ErrTooLarge + return 0, r.err + } + if n > len(r.decoded) { + r.decoded = make([]byte, n) + } + if _, err := s2.Decode(r.decoded, buf); err != nil { + r.err = err + return 0, r.err + } + if !r.ignoreCRC && crc(r.decoded[:n]) != checksum { + r.err = ErrCRC + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeUncompressedData: + r.blockStart += int64(r.j) + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + if debug { + fmt.Println("chunkLen < checksumSize", r.err) + } + r.err = ErrCorrupt + return 0, r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrTooLarge + } + return 0, r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf, false) { + if debug { + fmt.Println("Readfull failed", r.err) + } + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf. 
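+ // The rest of the chunk is the payload stored verbatim, so it can be
+ // read straight into r.decoded and returned after a CRC check, with no
+ // decode step and no extra copy through r.buf.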
+ n := chunkLen - checksumSize
+ if r.snappyFrame && n > maxSnappyBlockSize {
+ if debug {
+ fmt.Println("ERR: Snappy block too big")
+ }
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if n > r.maxBlock {
+ r.err = ErrTooLarge
+ return 0, r.err
+ }
+ if n > len(r.decoded) {
+ r.decoded = make([]byte, n)
+ }
+ if !r.readFull(r.decoded[:n], false) {
+ if debug {
+ fmt.Println("Readfull2 failed", r.err)
+ }
+ return 0, r.err
+ }
+ if !r.ignoreCRC && crc(r.decoded[:n]) != checksum {
+ r.err = ErrCRC
+ return 0, r.err
+ }
+
+ r.i, r.j = 0, n
+ continue
+ case chunkTypeEOF:
+ if debug {
+ fmt.Println("EOF chunk", chunkLen)
+ }
+ if chunkLen > binary.MaxVarintLen64 {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if chunkLen != 0 {
+ buf := r.tmp[:chunkLen]
+ if !r.readFull(buf, false) {
+ return 0, r.err
+ }
+ if !r.ignoreStreamID {
+ wantSize, n := binary.Uvarint(buf[:chunkLen])
+ if n != chunkLen {
+ if debug {
+ fmt.Println("ERR: EOF chunk length mismatch", n, chunkLen)
+ }
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if wantSize != uint64(r.blockStart+int64(r.j)) {
+ if debug {
+ fmt.Println("ERR: EOF data length mismatch", wantSize, r.blockStart+int64(r.j))
+ }
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if debug {
+ fmt.Println("EOF length verified", wantSize, "==", r.blockStart+int64(r.j), r.blockStart, r.j)
+ }
+ }
+ }
+ r.wantEOF = false
+ r.readHeader = false
+ continue
+ case ChunkTypeStreamIdentifier:
+ // Section 4.1. Stream identifier (chunk type 0xff).
+ if chunkLen != magicBodyLen {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if !r.readFull(r.tmp[:magicBodyLen], false) {
+ return 0, r.err
+ }
+ r.blockStart = 0
+ r.i, r.j = 0, 0
+ if string(r.tmp[:len(magicBody)]) == magicBody {
+ if !r.minLzHeader(r.tmp[:magicBodyLen]) {
+ return 0, r.err
+ }
+ continue
+ }
+
+ if !r.allowFallback {
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+ r.maxBlock = r.maxBlockOrg
+ if string(r.tmp[:magicBodyLen]) != magicBodyS2 && string(r.tmp[:magicBodyLen]) != magicBodySnappy {
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+ r.snappyFrame = string(r.tmp[:magicBodyLen]) == magicBodySnappy
+ continue
+ }
+
+ if chunkType <= maxNonSkippableChunk {
+ // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+ // fmt.Printf("ERR chunktype: 0x%x\n", chunkType)
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+
+ // Handle skippable chunks
+ if !r.skippable(r.buf, chunkLen, false, chunkType) {
+ return 0, r.err
+ }
+ }
+}
+
+// WriteTo writes data to w until there's no more data to write or
+// when an error occurs. The return value n is the number of bytes
+// written. Any error encountered during the write is also returned.
+func (r *Reader) WriteTo(w io.Writer) (n int64, err error) {
+ if r.i > 0 || r.j > 0 {
+ if r.i != r.j {
+ missing := r.decoded[r.i:r.j]
+ n2, err := w.Write(missing)
+ if err == nil && n2 != len(missing) {
+ err = io.ErrShortWrite
+ }
+ n += int64(n2)
+ if err != nil {
+ r.err = err
+ return n, r.err
+ }
+ }
+ r.blockStart += int64(r.j)
+ r.i, r.j = 0, 0
+ }
+ n2, err := r.DecodeConcurrent(w, runtime.NumCPU())
+ return n + n2, err
+}
+
+// DecodeConcurrent will decode the full stream to w.
+// This function should not be combined with reading, seeking or other operations.
+// Up to 'concurrent' goroutines will be used.
+// If <= 0, min(runtime.NumCPU, runtime.GOMAXPROCS, 8) will be used.
+// On success, the number of bytes decompressed and a nil error are returned.
+// This is mainly intended for bigger streams, since it will cause more allocations.
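+//
+// A minimal usage sketch, decompressing a whole stream (in and out are
+// illustrative):
+//
+//	r := minlz.NewReader(in)
+//	n, err := r.DecodeConcurrent(out, 0) // <= 0 picks a default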
+func (r *Reader) DecodeConcurrent(w io.Writer, concurrent int) (written int64, err error) { + if r.i > 0 || r.j > 0 { + return 0, errors.New("DecodeConcurrent called after Read") + } + if concurrent <= 0 { + concurrent = min(runtime.NumCPU(), runtime.GOMAXPROCS(0), 8) + } + if concurrent == 1 { + if rf, ok := w.(io.ReaderFrom); ok { + return rf.ReadFrom(r) + } + buf := make([]byte, 128<<10) + return io.CopyBuffer(w, r, buf) + } + + const debug = false + // Write to output + var errMu sync.Mutex + var aErr error + setErr := func(e error) (ok bool) { + errMu.Lock() + defer errMu.Unlock() + if e == nil { + return aErr == nil + } + if aErr == nil { + aErr = e + } + return false + } + hasErr := func() (ok bool) { + errMu.Lock() + v := aErr != nil + errMu.Unlock() + return v + } + + var aWritten int64 + toRead := make(chan []byte, concurrent+1) + writtenBlocks := make(chan []byte, concurrent+1) + queue := make(chan chan io.Writer, concurrent) + reUse := make(chan chan io.Writer, concurrent) + for i := 0; i < concurrent; i++ { + toRead <- nil // We do not know max block size yet, so don't alloc yet + writtenBlocks <- nil + reUse <- make(chan io.Writer, 1) + } + // Add extra in+out block, so we can read ahead by one. + toRead <- nil + writtenBlocks <- nil + + // Writer. + // We let the goroutine that did the decompression do the writing. + // We are more likely that decompressed data will be in local cache. + var wg sync.WaitGroup + wg.Add(1) + writeBuf := func(buf []byte, entry chan io.Writer) { + // Wait until our turn + w := <-entry + defer func() { + if buf != nil { + writtenBlocks <- buf + } + reUse <- entry + + // Take next top entry from queue. + next, ok := <-queue + if !ok { + wg.Done() + return + } + // Forward writer + next <- w + }() + n, err := w.Write(buf) + if err != nil { + setErr(err) + return + } + want := len(buf) + if n != want { + setErr(io.ErrShortWrite) + return + } + aWritten += int64(n) + } + + // Seed writer + seed := <-reUse + go writeBuf(nil, seed) + seed <- w + + // Cleanup + defer func() { + if r.err != nil { + setErr(r.err) + } else if err != nil { + setErr(err) + } + close(queue) + wg.Wait() + if err == nil { + err = aErr + } + written = aWritten + }() + + // Reader + for !hasErr() { + if !r.readFull(r.tmp[:4], !r.wantEOF) { + if r.err == io.EOF { + r.err = nil + } + return 0, r.err + } + chunkType := r.tmp[0] + chunkLen := int(r.tmp[1]) | int(r.tmp[2])<<8 | int(r.tmp[3])<<16 + if !r.readHeader { + if chunkType == ChunkTypeStreamIdentifier { + r.readHeader = true + } else if chunkType <= maxNonSkippableChunk && chunkType != chunkTypeEOF { + r.err = ErrCorrupt + return 0, r.err + } + } + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeLegacyCompressedData: + if !r.allowFallback { + if debug { + fmt.Println("ERR: Legacy compressed data not allowed") + } + r.err = ErrUnsupported + return 0, r.err + } + r.blockStart += int64(r.j) + r.j = 0 + // Section 4.2. Compressed data (chunk type 0x00). 
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + orgBuf := <-toRead + if cap(orgBuf) < chunkLen { + orgBuf = make([]byte, r.maxBufSize) + } + buf := orgBuf[:chunkLen] + + if !r.readFull(buf, false) { + return 0, r.err + } + + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > r.maxBlock { + r.err = ErrTooLarge + return 0, r.err + } + wg.Add(1) + + decoded := <-writtenBlocks + if cap(decoded) < n { + decoded = make([]byte, r.maxBlock) + } + entry := <-reUse + queue <- entry + r.blockStart += int64(r.j) + go func() { + defer wg.Done() + decoded = decoded[:n] + _, err := s2.Decode(decoded, buf) + toRead <- orgBuf + if err != nil { + writtenBlocks <- decoded + setErr(err) + writeBuf(nil, entry) + return + } + if !r.ignoreCRC && crc(decoded) != checksum { + writtenBlocks <- decoded + setErr(ErrCRC) + writeBuf(nil, entry) + return + } + writeBuf(decoded, entry) + }() + continue + case chunkTypeMinLZCompressedData, chunkTypeMinLZCompressedDataCompCRC: + r.blockStart += int64(r.j) + r.j = 0 + + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + orgBuf := <-toRead + if cap(orgBuf) < chunkLen { + orgBuf = make([]byte, r.maxBufSize) + } + buf := orgBuf[:chunkLen] + + if !r.readFull(buf, false) { + return 0, r.err + } + + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, hdrSize, err := decodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + + if n > r.maxBlock { + r.err = ErrTooLarge + return 0, r.err + } + r.blockStart += int64(n) + buf = buf[hdrSize:] + if n == 0 || n < len(buf) { + r.err = ErrCorrupt + return 0, r.err + } + + wg.Add(1) + + decoded := <-writtenBlocks + if cap(decoded) < n { + decoded = make([]byte, r.maxBlock) + } + entry := <-reUse + queue <- entry + go func() { + defer wg.Done() + decoded = decoded[:n] + ret := minLZDecode(decoded, buf) + + toRead <- orgBuf + if ret != 0 { + if debug { + fmt.Println("ERR: Decoder returned error code:", ret) + } + writtenBlocks <- decoded + setErr(ErrCorrupt) + writeBuf(nil, entry) + return + } + toCRC := decoded + if chunkType == chunkTypeMinLZCompressedDataCompCRC { + toCRC = buf + } + if !r.ignoreCRC && crc(toCRC) != checksum { + if debug { + fmt.Println("ERR: CRC mismatch", crc(decoded), checksum) + } + writtenBlocks <- decoded + setErr(ErrCRC) + writeBuf(nil, entry) + return + } + writeBuf(decoded, entry) + }() + continue + case chunkTypeUncompressedData: + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + r.blockStart += int64(r.j) + r.j = 0 + // Grab write buffer + orgBuf := <-writtenBlocks + if cap(orgBuf) < chunkLen { + orgBuf = make([]byte, r.maxBufSize) + } + buf := orgBuf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read content. 
+ n := chunkLen - checksumSize
+ r.blockStart += int64(n)
+
+ if r.snappyFrame && n > maxSnappyBlockSize {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if n > r.maxBlock {
+ r.err = ErrTooLarge
+ return 0, r.err
+ }
+
+ // Read uncompressed
+ buf = orgBuf[:n]
+ if !r.readFull(buf, false) {
+ return 0, r.err
+ }
+
+ if !r.ignoreCRC && crc(buf) != checksum {
+ r.err = ErrCRC
+ return 0, r.err
+ }
+ entry := <-reUse
+ queue <- entry
+ go writeBuf(buf, entry)
+ continue
+ case chunkTypeEOF:
+ if chunkLen != 0 {
+ if chunkLen > binary.MaxVarintLen64 {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+
+ buf := r.tmp[:chunkLen]
+ if !r.readFull(buf, false) {
+ return 0, r.err
+ }
+ if !r.ignoreStreamID {
+ wantSize, n := binary.Uvarint(buf[:chunkLen])
+ if n != chunkLen {
+ if debug {
+ fmt.Println("ERR: EOF chunk length mismatch", n, chunkLen)
+ }
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if wantSize != uint64(r.blockStart+int64(r.j)) {
+ if debug {
+ fmt.Println("ERR: EOF data length mismatch", wantSize, r.blockStart+int64(r.j))
+ }
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ }
+ }
+ r.wantEOF = false
+ r.readHeader = false
+ continue
+ case ChunkTypeStreamIdentifier:
+ // Section 4.1. Stream identifier (chunk type 0xff).
+ if chunkLen != magicBodyLen {
+ r.err = ErrCorrupt
+ return 0, r.err
+ }
+ if !r.readFull(r.tmp[:magicBodyLen], false) {
+ return 0, r.err
+ }
+ r.blockStart = 0
+ r.i, r.j = 0, 0
+ if string(r.tmp[:len(magicBody)]) == magicBody {
+ if !r.minLzHeader(r.tmp[:magicBodyLen]) {
+ return 0, r.err
+ }
+ continue
+ }
+ if !r.allowFallback {
+ if debug {
+ fmt.Println("!fallback")
+ }
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+ r.maxBlock = r.maxBlockOrg
+
+ if string(r.tmp[:magicBodyLen]) != magicBodyS2 && string(r.tmp[:magicBodyLen]) != magicBodySnappy {
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+ r.snappyFrame = string(r.tmp[:magicBodyLen]) == magicBodySnappy
+ continue
+ }
+
+ if chunkType <= maxNonSkippableChunk {
+ if debug {
+ fmt.Printf("ERR chunktype: 0x%x\n", chunkType)
+ }
+ // Section 4.5. Reserved unskippable chunks (chunk types 0x04-0x3f).
+ r.err = ErrUnsupported
+ return 0, r.err
+ }
+
+ // Section 4.4 Padding (chunk type 0xfe).
+ // Section 4.6. Reserved skippable chunks (chunk types 0x40-0xfd).
+ if !r.skippable(r.buf, chunkLen, false, chunkType) {
+ return 0, r.err
+ }
+ }
+ return 0, r.err
+}
+
+func (r *Reader) minLzHeader(hdr []byte) (ok bool) {
+ if len(hdr) < magicBodyLen {
+ r.err = ErrCorrupt
+ return false
+ }
+ // Upper 2 bits must be 0
+ if hdr[magicBodyLen-1]&(3<<6) != 0 {
+ r.err = ErrCorrupt
+ return false
+ }
+ n := hdr[magicBodyLen-1]&15 + 10
+ if n > maxBlockLog {
+ r.err = ErrCorrupt
+ return false
+ }
+ r.maxBlock = 1 << n
+ r.maxBufSize = MaxEncodedLen(r.maxBlock) + checksumSize
+ if r.maxBlock > r.maxBlockOrg {
+ r.err = ErrTooLarge
+ return false
+ }
+ if !r.ensureBufferSize(MaxEncodedLen(r.maxBlock) + checksumSize) {
+ if r.err == nil {
+ r.err = ErrTooLarge
+ }
+ return false
+ }
+ if len(r.decoded) < r.maxBlock {
+ r.decoded = make([]byte, 0, r.maxBlock)
+ }
+ r.snappyFrame = false
+ r.wantEOF = true
+ return true
+}
+
+// Skip will skip n bytes forward in the decompressed output.
+// For larger skips this consumes less CPU and is faster than reading output and discarding it.
+// CRC is not checked on skipped blocks.
+// io.ErrUnexpectedEOF is returned if the stream ends before all bytes have been skipped.
+// If a decoding error is encountered subsequent calls to Read will also fail.
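+//
+// For example, to resume reading at uncompressed offset 1<<20 (a sketch;
+// in is illustrative):
+//
+//	r := minlz.NewReader(in)
+//	if err := r.Skip(1 << 20); err != nil {
+//		// handle the error
+//	}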
+func (r *Reader) Skip(n int64) error { + if n < 0 { + return errors.New("attempted negative skip") + } + if r.err != nil { + return r.err + } + + for n > 0 { + if r.i < r.j { + // Skip in buffer. + // decoded[i:j] contains decoded bytes that have not yet been passed on. + left := int64(r.j - r.i) + if left >= n { + tmp := int64(r.i) + n + if tmp > math.MaxInt32 { + return errors.New("minlz: internal overflow in skip") + } + r.i = int(tmp) + return nil + } + n -= int64(r.j - r.i) + r.i = r.j + } + + // Buffer empty; read blocks until we have content. + if !r.readFull(r.tmp[:4], !r.wantEOF) { + if r.err == io.EOF { + r.err = io.ErrUnexpectedEOF + } + return r.err + } + chunkType := r.tmp[0] + if !r.readHeader { + if chunkType == ChunkTypeStreamIdentifier { + r.readHeader = true + } else if chunkType <= maxNonSkippableChunk && chunkType != chunkTypeEOF { + r.err = ErrCorrupt + return r.err + } + } + + chunkLen := int(r.tmp[1]) | int(r.tmp[2])<<8 | int(r.tmp[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeMinLZCompressedData, chunkTypeMinLZCompressedDataCompCRC: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrTooLarge + } + return r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + dLen, hdrSize, err := decodedLen(buf) + if err != nil { + r.err = err + return r.err + } + if dLen > r.maxBlock { + r.err = ErrTooLarge + return r.err + } + if dLen == 0 || dLen < len(buf)-hdrSize { + r.err = ErrCorrupt + return r.err + } + // Check if destination is within this block + if int64(dLen) > n { + if len(r.decoded) < dLen { + r.decoded = make([]byte, dLen) + } + buf = buf[hdrSize:] + if ret := minLZDecode(r.decoded[:dLen], buf); ret != 0 { + r.err = ErrTooLarge + return r.err + } + toCRC := r.decoded[:dLen] + if chunkType == chunkTypeMinLZCompressedDataCompCRC { + toCRC = buf + } + if !r.ignoreCRC && crc(toCRC) != checksum { + r.err = ErrCRC + return r.err + } + } else { + // Skip block completely + n -= int64(dLen) + r.blockStart += int64(dLen) + dLen = 0 + } + r.i, r.j = 0, dLen + continue + case chunkTypeLegacyCompressedData: + if !r.allowFallback { + r.err = ErrUnsupported + return r.err + } + + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). 
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err == nil { + r.err = ErrTooLarge + } + return r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + dLen, err := DecodedLen(buf) + if err != nil { + r.err = err + return r.err + } + if dLen > r.maxBlock { + r.err = ErrCorrupt + return r.err + } + // Check if destination is within this block + if int64(dLen) > n { + if len(r.decoded) < dLen { + r.decoded = make([]byte, dLen) + } + if _, err := s2.Decode(r.decoded, buf); err != nil { + r.err = err + return r.err + } + if crc(r.decoded[:dLen]) != checksum { + r.err = ErrCorrupt + return r.err + } + } else { + // Skip block completely + n -= int64(dLen) + r.blockStart += int64(dLen) + dLen = 0 + } + r.i, r.j = 0, dLen + continue + case chunkTypeUncompressedData: + r.blockStart += int64(r.j) + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return r.err + } + if !r.ensureBufferSize(chunkLen) { + if r.err != nil { + r.err = ErrTooLarge + } + return r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf, false) { + return r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf. + n2 := chunkLen - checksumSize + if n2 > len(r.decoded) { + if n2 > r.maxBlock { + r.err = ErrCorrupt + return r.err + } + r.decoded = make([]byte, n2) + } + if !r.readFull(r.decoded[:n2], false) { + return r.err + } + if int64(n2) < n { + if crc(r.decoded[:n2]) != checksum { + r.err = ErrCorrupt + return r.err + } + } + r.i, r.j = 0, n2 + continue + case chunkTypeEOF: + if chunkLen != 0 { + if chunkLen > binary.MaxVarintLen64 { + r.err = ErrCorrupt + return r.err + } + + buf := r.tmp[:chunkLen] + if !r.readFull(buf, false) { + return r.err + } + if !r.ignoreStreamID { + wantSize, n := binary.Uvarint(buf[:chunkLen]) + if n != chunkLen { + r.err = ErrCorrupt + return r.err + } + if wantSize != uint64(r.blockStart+int64(r.j)) { + r.err = ErrCorrupt + return r.err + } + } + } + r.wantEOF = false + r.readHeader = false + continue + case ChunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != magicBodyLen { + r.err = ErrCorrupt + return r.err + } + if !r.readFull(r.tmp[:magicBodyLen], false) { + return r.err + } + r.blockStart = 0 + r.i, r.j = 0, 0 + if string(r.tmp[:len(magicBody)]) == magicBody { + if !r.minLzHeader(r.tmp[:magicBodyLen]) { + return r.err + } + continue + } + if !r.allowFallback { + r.err = ErrUnsupported + return r.err + } + r.maxBlock = r.maxBlockOrg + if string(r.tmp[:magicBodyLen]) != magicBodyS2 && string(r.tmp[:magicBodyLen]) != magicBodySnappy { + r.err = ErrUnsupported + return r.err + } + r.snappyFrame = string(r.tmp[:magicBodyLen]) == magicBodySnappy + + continue + } + + if chunkType <= maxNonSkippableChunk { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + r.err = ErrUnsupported + return r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if !r.skippable(r.buf, chunkLen, false, chunkType) { + return r.err + } + } + return nil +} + +// ReadSeeker provides random or forward seeking in compressed content. 
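+// Random access requires an index, either supplied by the caller or loaded
+// from the stream itself.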
+// See Reader.ReadSeeker +type ReadSeeker struct { + *Reader + seek io.Seeker + readAtMu sync.Mutex +} + +// ReadSeeker will return an io.ReadSeeker and io.ReaderAt +// compatible version of the reader. +// The original input must support the io.Seeker interface. +// A custom index can be specified which will be used if supplied. +// When using a custom index, it will not be read from the input stream. +// The ReadAt position will affect regular reads and the current position of Seek. +// So using Read after ReadAt will continue from where the ReadAt stopped. +// No functions should be used concurrently. +// The returned ReadSeeker contains a shallow reference to the existing Reader, +// meaning changes performed to one is reflected in the other. +func (r *Reader) ReadSeeker(index []byte) (*ReadSeeker, error) { + // Read index if provided. + if len(index) != 0 { + if r.index == nil { + r.index = &Index{} + } + if _, err := r.index.Load(index); err != nil { + return nil, ErrCantSeek{Reason: "loading index returned: " + err.Error()} + } + } + + // Check if input is seekable + rs, ok := r.r.(io.ReadSeeker) + if !ok { + return nil, ErrCantSeek{Reason: "input stream isn't seekable"} + } + + if r.index != nil { + // Seekable and index, ok... + return &ReadSeeker{Reader: r, seek: rs}, nil + } + + // Load from stream. + r.index = &Index{} + + // Read current position. + pos, err := rs.Seek(0, io.SeekCurrent) + if err != nil { + return nil, ErrCantSeek{Reason: "seeking input returned: " + err.Error()} + } + err = r.index.LoadStream(rs) + if err != nil { + if err == ErrUnsupported { + return nil, ErrCantSeek{Reason: "input stream does not contain an index"} + } + return nil, ErrCantSeek{Reason: "reading index returned: " + err.Error()} + } + + // reset position. + _, err = rs.Seek(pos, io.SeekStart) + if err != nil { + return nil, ErrCantSeek{Reason: "seeking input returned: " + err.Error()} + } + return &ReadSeeker{Reader: r, seek: rs}, nil +} + +// Seek allows seeking in compressed data. +func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { + if r.err != nil { + if !errors.Is(r.err, io.EOF) { + return 0, r.err + } + // Reset on EOF + r.err = nil + } + + // Calculate absolute offset. + absOffset := offset + + switch whence { + case io.SeekStart: + case io.SeekCurrent: + absOffset = r.blockStart + int64(r.i) + offset + case io.SeekEnd: + if r.index == nil { + return 0, ErrUnsupported + } + absOffset = r.index.TotalUncompressed + offset + default: + r.err = ErrUnsupported + return 0, r.err + } + + if absOffset < 0 { + return 0, errors.New("seek before start of file") + } + + if !r.readHeader { + // Make sure we read the header. + // Seek to start, since we may be at EOF. + _, r.err = r.seek.Seek(0, io.SeekStart) + if r.err != nil { + return 0, r.err + } + _, r.err = r.Read([]byte{}) + if r.err != nil { + return 0, r.err + } + } + + // If we are inside current block no need to seek. + // This includes no offset changes. + if absOffset >= r.blockStart && absOffset < r.blockStart+int64(r.j) { + r.i = int(absOffset - r.blockStart) + return r.blockStart + int64(r.i), nil + } + + // We can seek and we have an index. + c, u, err := r.index.Find(absOffset) + if err != nil { + return r.blockStart + int64(r.i), err + } + + // Seek to next block + _, err = r.seek.Seek(c, io.SeekStart) + if err != nil { + return 0, err + } + + r.i = r.j // Remove rest of current block. + r.blockStart = u - int64(r.j) // Adjust current block start for accounting. 
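+ // The index stores block starts, so u is the uncompressed offset of
+ // the block containing absOffset; any remaining distance into the
+ // block is skipped below.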
+ if u < absOffset {
+ // Forward inside block
+ return absOffset, r.Skip(absOffset - u)
+ }
+ if u > absOffset {
+ return 0, fmt.Errorf("minlz seek: (internal error) u (%d) > absOffset (%d)", u, absOffset)
+ }
+ return absOffset, nil
+}
+
+// ReadAt reads len(p) bytes into p starting at offset off in the
+// underlying input source. It returns the number of bytes
+// read (0 <= n <= len(p)) and any error encountered.
+//
+// When ReadAt returns n < len(p), it returns a non-nil error
+// explaining why more bytes were not returned. In this respect,
+// ReadAt is stricter than Read.
+//
+// Even if ReadAt returns n < len(p), it may use all of p as scratch
+// space during the call. If some data is available but not len(p) bytes,
+// ReadAt blocks until either all the data is available or an error occurs.
+// In this respect ReadAt is different from Read.
+//
+// If the n = len(p) bytes returned by ReadAt are at the end of the
+// input source, ReadAt may return either err == EOF or err == nil.
+//
+// If ReadAt is reading from an input source with a seek offset,
+// ReadAt should not affect nor be affected by the underlying
+// seek offset.
+//
+// Clients of ReadAt can execute parallel ReadAt calls on the
+// same input source. This is however not recommended.
+func (r *ReadSeeker) ReadAt(p []byte, offset int64) (int, error) {
+ r.readAtMu.Lock()
+ defer r.readAtMu.Unlock()
+ _, err := r.Seek(offset, io.SeekStart)
+ if err != nil {
+ return 0, err
+ }
+ n := 0
+ for n < len(p) {
+ n2, err := r.Read(p[n:])
+ if err != nil {
+ // This will include io.EOF
+ return n + n2, err
+ }
+ n += n2
+ }
+ return n, nil
+}
+
+// Index will return the index used.
+func (r *ReadSeeker) Index() *Index {
+ return r.index
+}
+
+// ReadByte satisfies the io.ByteReader interface.
+func (r *Reader) ReadByte() (byte, error) {
+ if r.err != nil {
+ return 0, r.err
+ }
+ if r.i < r.j {
+ c := r.decoded[r.i]
+ r.i++
+ return c, nil
+ }
+ var tmp [1]byte
+ for i := 0; i < 10; i++ {
+ n, err := r.Read(tmp[:])
+ if err != nil {
+ return 0, err
+ }
+ if n == 1 {
+ return tmp[0], nil
+ }
+ }
+ return 0, io.ErrNoProgress
+}
+
+// UserChunkCB will register a callback for chunks with the specified ID.
+// ID must be a reserved user chunk ID, 0x80-0xfd (inclusive).
+// For each chunk with the ID, the callback is called with the content.
+// Any returned non-nil error will abort decompression.
+// Only one callback per ID is supported; the latest registered will be used.
+// Sending a nil function will disable previous callbacks.
+// You can peek the stream, triggering the callback, by doing a Read with a 0
+// byte buffer.
+func (r *Reader) UserChunkCB(id uint8, fn func(r io.Reader) error) error {
+ if id < MinUserSkippableChunk || id > MaxUserNonSkippableChunk {
+ return fmt.Errorf("UserChunkCB: Invalid id provided, must be 0x80-0xfd (inclusive)")
+ }
+ r.skippableCB[id-MinUserSkippableChunk] = fn
+ return nil
+}
diff --git a/vendor/github.com/minio/minlz/unsafe_disabled.go b/vendor/github.com/minio/minlz/unsafe_disabled.go
new file mode 100644
index 0000000000..d51f7ddb28
--- /dev/null
+++ b/vendor/github.com/minio/minlz/unsafe_disabled.go
@@ -0,0 +1,49 @@
+//go:build !(amd64 || arm64 || ppc64le || riscv64) || nounsafe || purego || appengine
+
+// Copyright 2025 MinIO Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "encoding/binary" +) + +func load8(b []byte, i int) byte { + return b[i] +} + +func load16(b []byte, i int) uint16 { + return binary.LittleEndian.Uint16(b[i:]) +} + +func load32(b []byte, i int) uint32 { + return binary.LittleEndian.Uint32(b[i:]) +} + +func load64(b []byte, i int) uint64 { + return binary.LittleEndian.Uint64(b[i:]) +} + +func store8(b []byte, idx int, v uint8) { + b[idx] = v +} + +func store16(b []byte, idx int, v uint16) { + binary.LittleEndian.PutUint16(b[idx:], v) +} + +func store32(b []byte, idx int, v uint32) { + binary.LittleEndian.PutUint32(b[idx:], v) +} diff --git a/vendor/github.com/minio/minlz/unsafe_enabled.go b/vendor/github.com/minio/minlz/unsafe_enabled.go new file mode 100644 index 0000000000..cf478c9829 --- /dev/null +++ b/vendor/github.com/minio/minlz/unsafe_enabled.go @@ -0,0 +1,59 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// We enable 64 bit LE platforms: + +//go:build (amd64 || arm64 || ppc64le || riscv64) && !nounsafe && !purego && !appengine + +package minlz + +import ( + "unsafe" +) + +func load8(b []byte, i int) byte { + return *(*byte)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +func load16(b []byte, i int) uint16 { + //return binary.LittleEndian.Uint16(b[i:]) + //return *(*uint16)(unsafe.Pointer(&b[i])) + return *(*uint16)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +func load32(b []byte, i int) uint32 { + //return binary.LittleEndian.Uint32(b[i:]) + //return *(*uint32)(unsafe.Pointer(&b[i])) + return *(*uint32)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +func load64(b []byte, i int) uint64 { + //return binary.LittleEndian.Uint64(b[i:]) + //return *(*uint64)(unsafe.Pointer(&b[i])) + return *(*uint64)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +func store8(b []byte, idx int, v uint8) { + *(*uint8)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), idx)) = v +} + +func store16(b []byte, idx int, v uint16) { + //binary.LittleEndian.PutUint16(b, v) + *(*uint16)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), idx)) = v +} + +func store32(b []byte, idx int, v uint32) { + //binary.LittleEndian.PutUint32(b, v) + *(*uint32)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), idx)) = v +} diff --git a/vendor/github.com/minio/minlz/writer.go b/vendor/github.com/minio/minlz/writer.go new file mode 100644 index 0000000000..443299639c --- /dev/null +++ b/vendor/github.com/minio/minlz/writer.go @@ -0,0 +1,1040 @@ +// Copyright 2025 MinIO Inc. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package minlz
+
+import (
+ "bytes"
+ "crypto/rand"
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "hash/crc32"
+ "io"
+ "math/bits"
+ "os"
+ "runtime"
+ "sync"
+)
+
+// NewWriter returns a new Writer that compresses as a MinLZ stream to w.
+//
+// Users must call Close to guarantee all data has been forwarded to
+// the underlying io.Writer and that resources are released.
+func NewWriter(w io.Writer, opts ...WriterOption) *Writer {
+ w2 := Writer{
+ blockSize: defaultBlockSize,
+ concurrency: runtime.GOMAXPROCS(0),
+ randSrc: rand.Reader,
+ level: LevelBalanced,
+ genIndex: true,
+ }
+ for _, opt := range opts {
+ if err := opt(&w2); err != nil {
+ w2.errState = err
+ return &w2
+ }
+ }
+ w2.obufLen = obufHeaderLen + MaxEncodedLen(w2.blockSize)
+ w2.paramsOK = true
+ w2.ibuf = make([]byte, 0, w2.blockSize)
+ w2.buffers.New = func() interface{} {
+ return make([]byte, w2.obufLen)
+ }
+ w2.Reset(w)
+ return &w2
+}
+
+// Writer is an io.Writer that can write MinLZ-compressed bytes.
+type Writer struct {
+ errMu sync.Mutex
+ errState error
+
+ // ibuf is a buffer for the incoming (uncompressed) bytes.
+ ibuf []byte
+
+ blockSize int
+ obufLen int
+ concurrency int
+ written int64
+ uncompWritten int64 // Bytes sent to compression
+ output chan chan result
+ buffers sync.Pool
+ pad int
+
+ writer io.Writer
+ randSrc io.Reader
+ writerWg sync.WaitGroup
+ index *Index
+ customEnc func(dst, src []byte) int
+
+ // wroteStreamHeader is whether we have written the stream header.
+ wroteStreamHeader bool
+ paramsOK bool
+ flushOnWrite bool
+ appendIndex bool
+ genIndex bool
+ level uint8
+}
+
+type result struct {
+ b []byte
+ // Uncompressed start offset
+ startOffset int64
+}
+
+var errClosed = errors.New("minlz: Writer is closed")
+var errNilWriter = errors.New("minlz: Writer has not been set")
+
+// err returns the previously set error.
+// If no error has been set, it is set to the provided err (if not nil).
+func (w *Writer) err(err error) error {
+ w.errMu.Lock()
+ errSet := w.errState
+ if errSet == nil && err != nil {
+ w.errState = err
+ errSet = err
+ }
+ w.errMu.Unlock()
+ return errSet
+}
+
+// Reset discards the writer's state and switches the writer to write to w.
+// This permits reusing a Writer rather than allocating a new one.
+func (w *Writer) Reset(writer io.Writer) {
+ if !w.paramsOK {
+ return
+ }
+ // Close previous writer, if any.
+ if w.output != nil {
+ close(w.output)
+ w.writerWg.Wait()
+ w.output = nil
+ }
+ if w.genIndex && w.index == nil {
+ w.index = &Index{}
+ }
+ w.errState = nil
+ w.ibuf = w.ibuf[:0]
+ w.wroteStreamHeader = false
+ w.written = 0
+ w.writer = writer
+ w.uncompWritten = 0
+ w.index.reset(w.blockSize)
+
+ // If we didn't get a writer, stop here.
+ if writer == nil {
+ w.err(errNilWriter)
+ return
+ }
+ // If no concurrency requested, don't spin up writer goroutine.
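+ // Otherwise, each submitted block reserves a result channel in
+ // w.output in submission order, and the writer goroutine started below
+ // drains them in that same order. Output therefore stays sequential
+ // even though blocks are compressed concurrently.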
+ if w.concurrency == 1 { + return + } + + toWrite := make(chan chan result, w.concurrency) + w.output = toWrite + w.writerWg.Add(1) + + // Start a writer goroutine that will write all output in order. + go func() { + defer w.writerWg.Done() + + // Get a queued write. + for write := range toWrite { + // Wait for the data to be available. + input := <-write + in := input.b + if len(in) > 0 { + if w.err(nil) == nil { + // Don't expose data from previous buffers. + toWrite := in[:len(in):len(in)] + // Write to output. + n, err := writer.Write(toWrite) + if err == nil && n != len(toWrite) { + err = io.ErrShortBuffer + } + _ = w.err(err) + w.err(w.index.add(w.written, input.startOffset)) + w.written += int64(n) + } + } + if cap(in) >= w.obufLen { + w.buffers.Put(in) + } + // close the incoming write request. + // This can be used for synchronizing flushes. + close(write) + } + }() +} + +// Write satisfies the io.Writer interface. +func (w *Writer) Write(p []byte) (nRet int, errRet error) { + if err := w.err(nil); err != nil { + return 0, err + } + if w.flushOnWrite { + return w.write(p) + } + // If we exceed the input buffer size, start writing + for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err(nil) == nil { + var n int + if len(w.ibuf) == 0 { + // Large write, empty buffer. + // Write directly from p to avoid copy. + n, _ = w.write(p) + } else { + n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) + w.ibuf = w.ibuf[:len(w.ibuf)+n] + w.write(w.ibuf) + w.ibuf = w.ibuf[:0] + } + nRet += n + p = p[n:] + } + if err := w.err(nil); err != nil { + return nRet, err + } + // p should always be able to fit into w.ibuf now. + n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) + w.ibuf = w.ibuf[:len(w.ibuf)+n] + nRet += n + return nRet, nil +} + +// ReadFrom implements the io.ReaderFrom interface. +// Using this is typically more efficient since it avoids a memory copy. +// ReadFrom reads data from r until EOF or error. +// The return value n is the number of bytes read. +// Any error except io.EOF encountered during the read is also returned. +func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) { + if err := w.err(nil); err != nil { + return 0, err + } + if len(w.ibuf) > 0 { + err := w.AsyncFlush() + if err != nil { + return 0, err + } + } + if br, ok := r.(byter); ok { + buf := br.Bytes() + if err := w.EncodeBuffer(buf); err != nil { + return 0, err + } + return int64(len(buf)), w.AsyncFlush() + } + for { + inbuf := w.buffers.Get().([]byte)[:w.blockSize+obufHeaderLen] + n2, err := io.ReadFull(r, inbuf[obufHeaderLen:]) + if err != nil { + if err == io.ErrUnexpectedEOF { + err = io.EOF + } + if err != io.EOF { + return n, w.err(err) + } + } + if n2 == 0 { + break + } + n += int64(n2) + err2 := w.writeFull(inbuf[:n2+obufHeaderLen]) + if w.err(err2) != nil { + break + } + + if err != nil { + // We got EOF and wrote everything + break + } + } + + return n, w.err(nil) +} + +// AddUserChunk will add a (non)skippable chunk to the stream. +// The ID must be in the range 0x80 -> 0xfe - inclusive. +// The length of the block must be <= MaxUserChunkSize bytes. 
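+//
+// A minimal sketch, assuming dst is an io.Writer (the chunk ID and payload
+// here are illustrative only):
+//
+//	w := minlz.NewWriter(dst)
+//	if err := w.AddUserChunk(minlz.MinUserSkippableChunk, []byte("metadata")); err != nil {
+//		// handle error
+//	}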
+func (w *Writer) AddUserChunk(id uint8, data []byte) (err error) { + if err := w.err(nil); err != nil { + return err + } + if id < MinUserSkippableChunk || id > ChunkTypePadding { + return fmt.Errorf("invalid skippable block id %x", id) + } + if len(data) > MaxUserChunkSize { + return fmt.Errorf("user chunk exceeds maximum size") + } + var header [4]byte + chunkLen := len(data) + header[0] = id + header[1] = uint8(chunkLen >> 0) + header[2] = uint8(chunkLen >> 8) + header[3] = uint8(chunkLen >> 16) + if w.concurrency == 1 { + write := func(b []byte) error { + n, err := w.writer.Write(b) + if err = w.err(err); err != nil { + return err + } + if n != len(b) { + return w.err(io.ErrShortWrite) + } + w.written += int64(n) + return w.err(nil) + } + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + if err := write(makeHeader(w.blockSize)); err != nil { + return err + } + } + if w.uncompWritten > 0 { + if err = w.err(w.index.add(w.written, w.uncompWritten)); err != nil { + return err + } + } + if err := write(header[:]); err != nil { + return err + } + return write(data) + } + + // Create output... + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + hWriter <- result{startOffset: w.uncompWritten, b: makeHeader(w.blockSize)} + } + + // Copy input. + inbuf := w.buffers.Get().([]byte)[:4] + copy(inbuf, header[:]) + inbuf = append(inbuf, data...) + + output := make(chan result, 1) + // Queue output. + w.output <- output + output <- result{startOffset: w.uncompWritten, b: inbuf} + + return nil +} + +// EncodeBuffer will add a buffer to the stream. +// This is the fastest way to encode a stream, +// but the input buffer cannot be written to by the caller +// until Flush or Close has been called when concurrency != 1. +// +// If you cannot control that, use the regular Write function. +// +// Note that input is not buffered. +// This means that each write will result in discrete blocks being created. +// For buffered writes, use the regular Write function. +func (w *Writer) EncodeBuffer(buf []byte) (err error) { + if err := w.err(nil); err != nil { + return err + } + + if w.flushOnWrite { + _, err := w.write(buf) + return err + } + // Flush queued data first. + if len(w.ibuf) > 0 { + err := w.AsyncFlush() + if err != nil { + return err + } + } + if w.concurrency == 1 { + _, err := w.writeSync(buf) + return err + } + + // Spawn goroutine and write block to output channel. + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + hWriter <- result{startOffset: w.uncompWritten, b: makeHeader(w.blockSize)} + } + + for len(buf) > 0 { + // Cut input. + uncompressed := buf + if len(uncompressed) > w.blockSize { + uncompressed = uncompressed[:w.blockSize] + } + buf = buf[len(uncompressed):] + // Get an output buffer. + obuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + res := result{ + startOffset: w.uncompWritten, + } + w.uncompWritten += int64(len(uncompressed)) + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. 
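+			// (n2 == 0 means the encoder could not make the block smaller,
+			// so it is stored as an uncompressed chunk instead.)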
+ if n2 > 0 { + chunkType = uint8(chunkTypeMinLZCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // copy uncompressed + copy(obuf[obufHeaderLen:], uncompressed) + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + res.b = obuf + output <- res + }() + } + return nil +} + +func (w *Writer) encodeBlock(obuf, uncompressed []byte) int { + if w.customEnc != nil { + if ret := w.customEnc(obuf, uncompressed); ret >= 0 { + return ret + } + } + var n int + + switch w.level { + case LevelFastest: + n = encodeBlock(obuf, uncompressed) + case LevelBalanced: + n = encodeBlockBetter(obuf, uncompressed) + case LevelSmallest: + n = encodeBlockBest(obuf, uncompressed, nil) + } + + if debugValidateBlocks && n > 0 { + fmt.Println("debugValidateBlocks:", len(uncompressed), "->", n) + //debug.PrintStack() + src := uncompressed + block := obuf[:n] + dst := make([]byte, len(src)) + ret := minLZDecode(dst, block) + if ret != 0 || !bytes.Equal(dst, src) { + n := matchLen(dst, src) + x := crc32.ChecksumIEEE(src) + name := fmt.Sprintf("errs/block-%08x-%d", x, ret) + fmt.Println(name, "mismatch at pos", n) + os.WriteFile(name+"input.bin", src, 0644) + os.WriteFile(name+"decoded.bin", dst, 0644) + os.WriteFile(name+"compressed.bin", block, 0644) + } + } + return n +} + +func (w *Writer) write(p []byte) (nRet int, errRet error) { + if err := w.err(nil); err != nil { + return 0, err + } + if w.concurrency == 1 { + return w.writeSync(p) + } + + // Spawn goroutine and write block to output channel. + for len(p) > 0 { + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + hWriter <- result{startOffset: w.uncompWritten, b: makeHeader(w.blockSize)} + } + + var uncompressed []byte + if len(p) > w.blockSize { + uncompressed, p = p[:w.blockSize], p[w.blockSize:] + } else { + uncompressed, p = p, nil + } + + // Copy input. + // If the block is incompressible, this is used for the result. + inbuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen] + obuf := w.buffers.Get().([]byte)[:w.obufLen] + copy(inbuf[obufHeaderLen:], uncompressed) + uncompressed = inbuf[obufHeaderLen:] + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + res := result{ + startOffset: w.uncompWritten, + } + w.uncompWritten += int64(len(uncompressed)) + + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeMinLZCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // Use input as output. + obuf, inbuf = inbuf, obuf + } + + // Fill in the per-chunk header that comes before the body. 
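+			// Layout: byte 0 = chunk type, bytes 1-3 = chunk length as
+			// 24-bit little endian (checksum plus body), bytes 4-7 = CRC
+			// of the uncompressed data.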
+ obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + res.b = obuf + output <- res + + // Put unused buffer back in pool. + w.buffers.Put(inbuf) + }() + nRet += len(uncompressed) + } + return nRet, nil +} + +// writeFull is a special version of write that will always write the full buffer. +// Data to be compressed should start at offset obufHeaderLen and fill the remainder of the buffer. +// The data will be written as a single block. +// The caller is not allowed to use inbuf after this function has been called. +func (w *Writer) writeFull(inbuf []byte) (errRet error) { + if err := w.err(nil); err != nil { + return err + } + + if w.concurrency == 1 { + _, err := w.writeSync(inbuf[obufHeaderLen:]) + return err + } + + // Spawn goroutine and write block to output channel. + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + hWriter := make(chan result) + w.output <- hWriter + hWriter <- result{startOffset: w.uncompWritten, b: makeHeader(w.blockSize)} + } + + // Get an output buffer. + obuf := w.buffers.Get().([]byte)[:w.obufLen] + uncompressed := inbuf[obufHeaderLen:] + + output := make(chan result) + // Queue output now, so we keep order. + w.output <- output + res := result{ + startOffset: w.uncompWritten, + } + w.uncompWritten += int64(len(uncompressed)) + + go func() { + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. + n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed))) + n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed) + + // Check if we should use this, or store as uncompressed instead. + if n2 > 0 { + chunkType = uint8(chunkTypeMinLZCompressedData) + chunkLen = 4 + n + n2 + obuf = obuf[:obufHeaderLen+n+n2] + } else { + // Use input as output. + obuf, inbuf = inbuf, obuf + } + + // Fill in the per-chunk header that comes before the body. + obuf[0] = chunkType + obuf[1] = uint8(chunkLen >> 0) + obuf[2] = uint8(chunkLen >> 8) + obuf[3] = uint8(chunkLen >> 16) + obuf[4] = uint8(checksum >> 0) + obuf[5] = uint8(checksum >> 8) + obuf[6] = uint8(checksum >> 16) + obuf[7] = uint8(checksum >> 24) + + // Queue final output. + res.b = obuf + output <- res + + // Put unused buffer back in pool. + w.buffers.Put(inbuf) + }() + return nil +} + +func (w *Writer) writeSync(p []byte) (nRet int, errRet error) { + if err := w.err(nil); err != nil { + return 0, err + } + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + var n int + var err error + n, err = w.writer.Write(makeHeader(w.blockSize)) + if err != nil { + return 0, w.err(err) + } + if n != len(magicChunk)+1 { + return 0, w.err(io.ErrShortWrite) + } + w.written += int64(n) + } + + for len(p) > 0 { + var uncompressed []byte + if len(p) > w.blockSize { + uncompressed, p = p[:w.blockSize], p[w.blockSize:] + } else { + uncompressed, p = p, nil + } + + obuf := w.buffers.Get().([]byte)[:w.obufLen] + checksum := crc(uncompressed) + + // Set to uncompressed. + chunkType := uint8(chunkTypeUncompressedData) + chunkLen := 4 + len(uncompressed) + + // Attempt compressing. 
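+		// A compressed chunk body starts with the uncompressed length as a
+		// uvarint, followed by the encoded data.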
+		n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
+		n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
+
+		if n2 > 0 {
+			chunkType = uint8(chunkTypeMinLZCompressedData)
+			chunkLen = 4 + n + n2
+			obuf = obuf[:obufHeaderLen+n+n2]
+		} else {
+			obuf = obuf[:8]
+		}
+
+		// Fill in the per-chunk header that comes before the body.
+		obuf[0] = chunkType
+		obuf[1] = uint8(chunkLen >> 0)
+		obuf[2] = uint8(chunkLen >> 8)
+		obuf[3] = uint8(chunkLen >> 16)
+		obuf[4] = uint8(checksum >> 0)
+		obuf[5] = uint8(checksum >> 8)
+		obuf[6] = uint8(checksum >> 16)
+		obuf[7] = uint8(checksum >> 24)
+
+		n, err := w.writer.Write(obuf)
+		if err != nil {
+			return 0, w.err(err)
+		}
+		if n != len(obuf) {
+			return 0, w.err(io.ErrShortWrite)
+		}
+		w.err(w.index.add(w.written, w.uncompWritten))
+		w.written += int64(n)
+		w.uncompWritten += int64(len(uncompressed))
+
+		if chunkType == chunkTypeUncompressedData {
+			// Write uncompressed data.
+			n, err := w.writer.Write(uncompressed)
+			if err != nil {
+				return 0, w.err(err)
+			}
+			if n != len(uncompressed) {
+				return 0, w.err(io.ErrShortWrite)
+			}
+			w.written += int64(n)
+		}
+		w.buffers.Put(obuf)
+		// Queue final output.
+		nRet += len(uncompressed)
+	}
+	return nRet, nil
+}
+
+// AsyncFlush writes any buffered bytes to a block and starts compressing it.
+// It does not wait for the output to have been written, as Flush() does.
+func (w *Writer) AsyncFlush() error {
+	if err := w.err(nil); err != nil {
+		return err
+	}
+
+	// Queue any data still in input buffer.
+	if len(w.ibuf) != 0 {
+		if !w.wroteStreamHeader {
+			_, err := w.writeSync(w.ibuf)
+			w.ibuf = w.ibuf[:0]
+			return w.err(err)
+		} else {
+			_, err := w.write(w.ibuf)
+			w.ibuf = w.ibuf[:0]
+			err = w.err(err)
+			if err != nil {
+				return err
+			}
+		}
+	}
+	return w.err(nil)
+}
+
+// Flush flushes the Writer to its underlying io.Writer.
+// This does not apply padding.
+func (w *Writer) Flush() error {
+	if err := w.AsyncFlush(); err != nil {
+		return err
+	}
+
+	if w.output == nil {
+		return w.err(nil)
+	}
+
+	// Send empty buffer
+	res := make(chan result)
+	w.output <- res
+	// Block until this has been picked up.
+	res <- result{b: nil, startOffset: w.uncompWritten}
+	// When it is closed, we have flushed.
+	<-res
+	return w.err(nil)
+}
+
+// Close calls Flush and then closes the Writer.
+// This is required to mark the end of the stream.
+// Calling Close multiple times is ok,
+// but calling CloseIndex after this will not return the index.
+func (w *Writer) Close() error {
+	_, err := w.closeIndex(w.appendIndex)
+	return err
+}
+
+// Written returns the number of uncompressed (input) and compressed bytes (output)
+// that have been processed since start or the last Reset call.
+// This is only safe to call after Flush() or Close/CloseIndex has been called.
+func (w *Writer) Written() (input, output int64) {
+	return w.uncompWritten, w.written
+}
+
+// CloseIndex calls Close and returns an index on first call.
+// This is not required if you are only adding an index to a stream.
+func (w *Writer) CloseIndex() ([]byte, error) {
+	return w.closeIndex(true)
+}
+
+func (w *Writer) closeIndex(idx bool) ([]byte, error) {
+	err := w.Flush()
+	if w.output != nil {
+		close(w.output)
+		w.writerWg.Wait()
+		w.output = nil
+	}
+	if idx && w.index == nil {
+		return nil, errors.New("index requested, but was asked to not generate one")
+	}
+	// Write EOF marker.
+	if w.err(err) == nil && w.writer != nil {
+		var tmp [4 + binary.MaxVarintLen64]byte
+		tmp[0] = chunkTypeEOF
+		// Write uncompressed size.
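+		// The EOF chunk stores the total uncompressed size as a uvarint,
+		// which lets readers validate that the stream is complete.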
+		n := binary.PutUvarint(tmp[4:], uint64(w.uncompWritten))
+		tmp[1] = uint8(n)
+		n += 4
+		_, err := w.writer.Write(tmp[:n])
+		_ = w.err(err)
+		w.written += int64(n)
+	}
+	var index []byte
+	if w.err(err) == nil && w.writer != nil {
+		// Create index.
+		if idx {
+			compSize := int64(-1)
+			if w.pad <= 1 {
+				compSize = w.written
+			}
+			index = w.index.appendTo(w.ibuf[:0], w.uncompWritten, compSize)
+			// Count as written for padding.
+			if w.appendIndex {
+				w.written += int64(len(index))
+			}
+		}
+		// Add padding.
+		if w.pad > 1 {
+			tmp := w.ibuf[:0]
+			if len(index) > 0 {
+				// Allocate another buffer.
+				tmp = w.buffers.Get().([]byte)[:0]
+				defer w.buffers.Put(tmp)
+			}
+			add := calcSkippableFrame(w.written, int64(w.pad))
+			frame, err := skippableFrame(tmp, add, w.randSrc)
+			if err = w.err(err); err != nil {
+				return nil, err
+			}
+			n, err2 := w.writer.Write(frame)
+			if err2 == nil && n != len(frame) {
+				err2 = io.ErrShortWrite
+			}
+			w.written += int64(n)
+			_ = w.err(err2)
+		}
+		// Add index.
+		if len(index) > 0 && w.appendIndex {
+			n, err2 := w.writer.Write(index)
+			if err2 == nil && n != len(index) {
+				err2 = io.ErrShortWrite
+			}
+			// (index already accounted for in w.written)
+			_ = w.err(err2)
+		}
+	}
+	err = w.err(errClosed)
+	if err == errClosed || err == errNilWriter {
+		return index, nil
+	}
+	return nil, err
+}
+
+// calcSkippableFrame returns the total number of padding bytes to add so that
+// written becomes divisible by wantMultiple.
+// The returned value is either 0 or at least skippableFrameHeader.
+// The function will panic if written < 0 or wantMultiple <= 0.
+func calcSkippableFrame(written, wantMultiple int64) int {
+	if wantMultiple <= 0 {
+		panic("wantMultiple <= 0")
+	}
+	if written < 0 {
+		panic("written < 0")
+	}
+	leftOver := written % wantMultiple
+	if leftOver == 0 {
+		return 0
+	}
+	toAdd := wantMultiple - leftOver
+	for toAdd < skippableFrameHeader {
+		toAdd += wantMultiple
+	}
+	return int(toAdd)
+}
+
+// skippableFrame will add a skippable frame with a total size of total bytes.
+// total should be >= skippableFrameHeader and < maxBlockSize + skippableFrameHeader
+func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) {
+	if total == 0 {
+		return dst, nil
+	}
+	if total < skippableFrameHeader {
+		return dst, fmt.Errorf("minlz: requested skippable frame (%d) < 4", total)
+	}
+	if int64(total) >= maxBlockSize+skippableFrameHeader {
+		return dst, fmt.Errorf("minlz: requested skippable frame (%d) >= max 1<<24", total)
+	}
+	// Chunk type 0xfe "Section 4.4 Padding (chunk type 0xfe)"
+	dst = append(dst, ChunkTypePadding)
+	f := uint32(total - skippableFrameHeader)
+	// Add chunk length.
+	dst = append(dst, uint8(f), uint8(f>>8), uint8(f>>16))
+	// Add data
+	start := len(dst)
+	dst = append(dst, make([]byte, f)...)
+	_, err := io.ReadFull(r, dst[start:])
+	return dst, err
+}
+
+// WriterOption is an option for creating an encoder.
+type WriterOption func(*Writer) error
+
+// WriterConcurrency will set the concurrency,
+// meaning the maximum number of blocks to encode concurrently.
+// The value supplied must be at least 1.
+// By default this will be set to GOMAXPROCS.
+func WriterConcurrency(n int) WriterOption {
+	return func(w *Writer) error {
+		if n <= 0 {
+			return errors.New("concurrency must be at least 1")
+		}
+		w.concurrency = n
+		return nil
+	}
+}
+
+// WriterAddIndex will append an index to the end of a stream
+// when it is closed.
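+//
+// A minimal sketch, assuming dst is an io.Writer:
+//
+//	w := minlz.NewWriter(dst, minlz.WriterAddIndex(true))
+//	// ... write data ...
+//	err := w.Close() // the index is appended when the stream is closed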
+func WriterAddIndex(b bool) WriterOption {
+	return func(w *Writer) error {
+		if b && !w.genIndex {
+			return errors.New("WriterAddIndex: WriterCreateIndex has been called with a false parameter")
+		}
+		w.appendIndex = b
+		return nil
+	}
+}
+
+// WriterLevel will set the compression level.
+func WriterLevel(n int) WriterOption {
+	return func(w *Writer) error {
+		if n < 0 || n > LevelSmallest {
+			return ErrInvalidLevel
+		}
+		w.level = uint8(n)
+		return nil
+	}
+}
+
+// WriterUncompressed will bypass compression.
+// The stream will be written as uncompressed blocks only.
+// If concurrency is > 1, CRC calculation and output will still be done asynchronously.
+func WriterUncompressed() WriterOption {
+	return func(w *Writer) error {
+		w.level = 0
+		return nil
+	}
+}
+
+// WriterBlockSize allows overriding the default block size.
+// Blocks will be this size or smaller.
+// Minimum size is 4KB and maximum size is 4MB.
+//
+// Bigger blocks may give bigger throughput on systems with many cores,
+// and will increase compression slightly, but it will limit the possible
+// concurrency for smaller payloads for both encoding and decoding.
+// Default block size is 1MB.
+//
+// When writing Snappy compatible output using WriterSnappyCompat,
+// the maximum block size is 64KB.
+func WriterBlockSize(n int) WriterOption {
+	return func(w *Writer) error {
+		if n > maxBlockSize || n < minBlockSize {
+			return errors.New("minlz: block size out of bounds. Must be <= 4MB and >= 4KB")
+		}
+		w.blockSize = n
+		return nil
+	}
+}
+
+// WriterPadding will add padding to all output so the size will be a multiple of n.
+// This can be used to obfuscate the exact output size or make blocks of a certain size.
+// The contents will be a skippable frame, so it will be invisible to the decoder.
+// n must be > 0 and <= 4MB.
+// The padded area will be filled with data from crypto/rand.Reader.
+// The padding will be applied whenever Close is called on the writer.
+func WriterPadding(n int) WriterOption {
+	return func(w *Writer) error {
+		if n <= 0 {
+			return fmt.Errorf("minlz: padding must be at least 1")
+		}
+		// No need to waste our time.
+		if n == 1 {
+			w.pad = 0
+		}
+		if n > maxBlockSize {
+			return fmt.Errorf("minlz: padding must be less than 4MB")
+		}
+		w.pad = n
+		return nil
+	}
+}
+
+// WriterPaddingSrc will get random data for padding from the supplied source.
+// By default, crypto/rand is used.
+func WriterPaddingSrc(reader io.Reader) WriterOption {
+	return func(w *Writer) error {
+		w.randSrc = reader
+		return nil
+	}
+}
+
+// WriterFlushOnWrite will compress blocks on each call to the Write function.
+//
+// This is quite inefficient as block size will depend on the write size.
+//
+// Use WriterConcurrency(1) to also make sure that output is flushed
+// when Write calls return; otherwise it will be written when compression is done.
+func WriterFlushOnWrite() WriterOption {
+	return func(w *Writer) error {
+		w.flushOnWrite = true
+		return nil
+	}
+}
+
+// WriterCustomEncoder allows overriding the encoder for blocks on the stream.
+// The function must compress 'src' into 'dst' and return the number of bytes used in dst.
+// The block size (initial varint) should not be added by the encoder.
+// Returning 0 indicates the block could not be compressed.
+// Returning a negative value indicates that the built-in compression should be attempted instead.
+// The function should expect to be called concurrently.
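+//
+// A minimal sketch of a custom encoder that always defers to the built-in
+// compression:
+//
+//	w := minlz.NewWriter(dst, minlz.WriterCustomEncoder(
+//		func(dst, src []byte) int {
+//			return -1 // fall back to the regular encoder
+//		},
+//	))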
+func WriterCustomEncoder(fn func(dst, src []byte) int) WriterOption { + return func(w *Writer) error { + w.customEnc = fn + return nil + } +} + +// WriterCreateIndex allows to disable the default index creation. +// This can be used when no index will be needed - for example on network streams. +func WriterCreateIndex(b bool) WriterOption { + return func(w *Writer) error { + w.genIndex = b + if !w.genIndex && w.appendIndex { + return errors.New("WriterCreateIndex: Cannot disable when WriterAddIndex has been requested") + } + return nil + } +} + +func makeHeader(blockSize int) []byte { + hdr := append(make([]byte, 0, len(magicChunk)+1), magicChunk...) + return append(hdr, byte(bits.Len(uint(blockSize-1)))-10) +} diff --git a/vendor/github.com/nwaples/rardecode/archive.go b/vendor/github.com/nwaples/rardecode/archive.go deleted file mode 100644 index 34e4ac2862..0000000000 --- a/vendor/github.com/nwaples/rardecode/archive.go +++ /dev/null @@ -1,342 +0,0 @@ -package rardecode - -import ( - "bufio" - "bytes" - "errors" - "fmt" - "io" - "os" - "path/filepath" - "regexp" - "strconv" - "strings" -) - -const ( - maxSfxSize = 0x100000 // maximum number of bytes to read when searching for RAR signature - sigPrefix = "Rar!\x1A\x07" - - fileFmt15 = iota + 1 // Version 1.5 archive file format - fileFmt50 // Version 5.0 archive file format -) - -var ( - errNoSig = errors.New("rardecode: RAR signature not found") - errVerMismatch = errors.New("rardecode: volume version mistmatch") - errCorruptHeader = errors.New("rardecode: corrupt block header") - errCorruptFileHeader = errors.New("rardecode: corrupt file header") - errBadHeaderCrc = errors.New("rardecode: bad header crc") - errUnknownArc = errors.New("rardecode: unknown archive version") - errUnknownDecoder = errors.New("rardecode: unknown decoder version") - errUnsupportedDecoder = errors.New("rardecode: unsupported decoder version") - errArchiveContinues = errors.New("rardecode: archive continues in next volume") - errArchiveEnd = errors.New("rardecode: archive end reached") - errDecoderOutOfData = errors.New("rardecode: decoder expected more data than is in packed file") - - reDigits = regexp.MustCompile(`\d+`) -) - -type readBuf []byte - -func (b *readBuf) byte() byte { - v := (*b)[0] - *b = (*b)[1:] - return v -} - -func (b *readBuf) uint16() uint16 { - v := uint16((*b)[0]) | uint16((*b)[1])<<8 - *b = (*b)[2:] - return v -} - -func (b *readBuf) uint32() uint32 { - v := uint32((*b)[0]) | uint32((*b)[1])<<8 | uint32((*b)[2])<<16 | uint32((*b)[3])<<24 - *b = (*b)[4:] - return v -} - -func (b *readBuf) bytes(n int) []byte { - v := (*b)[:n] - *b = (*b)[n:] - return v -} - -func (b *readBuf) uvarint() uint64 { - var x uint64 - var s uint - for i, n := range *b { - if n < 0x80 { - *b = (*b)[i+1:] - return x | uint64(n)< 1 { - // More than 1 match so assume name.part###of###.rar style. - // Take the last 2 matches where the first is the volume number. - m = m[l-2 : l] - if strings.Contains(file[m[0][1]:m[1][0]], ".") || !strings.Contains(file[:m[0][0]], ".") { - // Didn't match above style as volume had '.' between the two numbers or didnt have a '.' - // before the first match. Use the second number as volume number. 
- m = m[1:] - } - } - // extract and increment volume number - lo, hi := m[0][0], m[0][1] - n, err := strconv.Atoi(file[lo:hi]) - if err != nil { - n = 0 - } else { - n++ - } - // volume number must use at least the same number of characters as previous volume - vol := fmt.Sprintf("%0"+fmt.Sprint(hi-lo)+"d", n) - file = file[:lo] + vol + file[hi:] - return file -} - -func nextOldVolName(file string) string { - // old style volume naming - i := strings.LastIndex(file, ".") - // For old style naming if 2nd and 3rd character of file extension is not a digit replace - // with "00" and ignore any trailing characters. - if len(file) < i+4 || file[i+2] < '0' || file[i+2] > '9' || file[i+3] < '0' || file[i+3] > '9' { - file = file[:i+2] + "00" - return file - } - // get file extension - b := []byte(file[i+1:]) - // start incrementing volume number digits from rightmost - for j := 2; j >= 0; j-- { - if b[j] != '9' { - b[j]++ - break - } - // digit overflow - if j == 0 { - // last character before '.' - b[j] = 'A' - } else { - // set to '0' and loop to next character - b[j] = '0' - } - } - file = file[:i+1] + string(b) - return file -} - -// openNextFile opens the next volume file in the archive. -func (v *volume) openNextFile() error { - file := v.file - if v.num == 0 { - // check file extensions - i := strings.LastIndex(file, ".") - if i < 0 { - // no file extension, add one - file += ".rar" - } else { - ext := strings.ToLower(file[i+1:]) - // replace with .rar for empty extensions & self extracting archives - if ext == "" || ext == "exe" || ext == "sfx" { - file = file[:i+1] + "rar" - } - } - if a, ok := v.fileBlockReader.(*archive15); ok { - v.old = a.old - } - // new naming scheme must have volume number in filename - if !v.old { - if reDigits.FindStringIndex(file) != nil { - // found digits, try using new naming scheme - err := v.openFile(nextNewVolName(file)) - if err != nil && os.IsNotExist(err) { - // file didn't exist, try old naming scheme - oldErr := v.openFile(nextOldVolName(file)) - if oldErr == nil || !os.IsNotExist(err) { - v.old = true - return oldErr - } - } - return err - } - v.old = true - } - } - // new style volume naming - if !v.old { - file = nextNewVolName(file) - } else { - file = nextOldVolName(file) - } - return v.openFile(file) -} - -func (v *volume) next() (*fileBlockHeader, error) { - for { - var atEOF bool - - h, err := v.fileBlockReader.next() - switch err { - case errArchiveContinues: - case io.EOF: - // Read all of volume without finding an end block. The only way - // to tell if the archive continues is to try to open the next volume. 
- atEOF = true - default: - return h, err - } - - v.f.Close() - err = v.openNextFile() // Open next volume file - if err != nil { - if atEOF && os.IsNotExist(err) { - // volume not found so assume that the archive has ended - return nil, io.EOF - } - return nil, err - } - v.num++ - v.br.Reset(v.f) - ver, err := findSig(v.br) - if err != nil { - return nil, err - } - if v.version() != ver { - return nil, errVerMismatch - } - v.files = append(v.files, v.dir+v.file) - v.reset() // reset encryption - } -} - -func (v *volume) Close() error { - // may be nil if os.Open fails in next() - if v.f == nil { - return nil - } - return v.f.Close() -} - -func openVolume(name, password string) (*volume, error) { - var err error - v := new(volume) - v.dir, v.file = filepath.Split(name) - v.f, err = os.Open(name) - if err != nil { - return nil, err - } - v.br = bufio.NewReader(v.f) - v.fileBlockReader, err = newFileBlockReader(v.br, password) - if err != nil { - v.f.Close() - return nil, err - } - v.files = append(v.files, name) - return v, nil -} - -func newFileBlockReader(br *bufio.Reader, pass string) (fileBlockReader, error) { - runes := []rune(pass) - if len(runes) > maxPassword { - pass = string(runes[:maxPassword]) - } - ver, err := findSig(br) - if err != nil { - return nil, err - } - switch ver { - case fileFmt15: - return newArchive15(br, pass), nil - case fileFmt50: - return newArchive50(br, pass), nil - } - return nil, errUnknownArc -} diff --git a/vendor/github.com/nwaples/rardecode/archive50.go b/vendor/github.com/nwaples/rardecode/archive50.go deleted file mode 100644 index 1d8f850dcd..0000000000 --- a/vendor/github.com/nwaples/rardecode/archive50.go +++ /dev/null @@ -1,475 +0,0 @@ -package rardecode - -import ( - "bufio" - "bytes" - "crypto/hmac" - "crypto/sha256" - "errors" - "hash" - "hash/crc32" - "io" - "io/ioutil" - "time" -) - -const ( - // block types - block5Arc = 1 - block5File = 2 - block5Service = 3 - block5Encrypt = 4 - block5End = 5 - - // block flags - block5HasExtra = 0x0001 - block5HasData = 0x0002 - block5DataNotFirst = 0x0008 - block5DataNotLast = 0x0010 - - // end block flags - endArc5NotLast = 0x0001 - - // archive encryption block flags - enc5CheckPresent = 0x0001 // password check data is present - - // main archive block flags - arc5MultiVol = 0x0001 - arc5Solid = 0x0004 - - // file block flags - file5IsDir = 0x0001 - file5HasUnixMtime = 0x0002 - file5HasCRC32 = 0x0004 - file5UnpSizeUnknown = 0x0008 - - // file encryption record flags - file5EncCheckPresent = 0x0001 // password check data is present - file5EncUseMac = 0x0002 // use MAC instead of plain checksum - - cacheSize50 = 4 - maxPbkdf2Salt = 64 - pwCheckSize = 8 - maxKdfCount = 24 - - minHeaderSize = 7 -) - -var ( - errBadPassword = errors.New("rardecode: incorrect password") - errCorruptEncrypt = errors.New("rardecode: corrupt encryption data") - errUnknownEncMethod = errors.New("rardecode: unknown encryption method") -) - -type extra struct { - ftype uint64 // field type - data readBuf // field data -} - -type blockHeader50 struct { - htype uint64 // block type - flags uint64 - data readBuf // block header data - extra []extra // extra fields - dataSize int64 // size of block data -} - -// leHash32 wraps a hash.Hash32 to return the result of Sum in little -// endian format. 
-type leHash32 struct { - hash.Hash32 -} - -func (h leHash32) Sum(b []byte) []byte { - s := h.Sum32() - return append(b, byte(s), byte(s>>8), byte(s>>16), byte(s>>24)) -} - -func newLittleEndianCRC32() hash.Hash32 { - return leHash32{crc32.NewIEEE()} -} - -// hash50 implements fileChecksum for RAR 5 archives -type hash50 struct { - hash.Hash // hash file data is written to - sum []byte // file checksum - key []byte // if present used with hmac in calculating checksum from hash -} - -func (h *hash50) valid() bool { - sum := h.Sum(nil) - if len(h.key) > 0 { - mac := hmac.New(sha256.New, h.key) - mac.Write(sum) - sum = mac.Sum(sum[:0]) - if len(h.sum) == 4 { - // CRC32 - for i, v := range sum[4:] { - sum[i&3] ^= v - } - sum = sum[:4] - } - } - return bytes.Equal(sum, h.sum) -} - -// archive50 implements fileBlockReader for RAR 5 file format archives -type archive50 struct { - byteReader // reader for current block data - v *bufio.Reader // reader for current archive volume - pass []byte - blockKey []byte // key used to encrypt blocks - multi bool // archive is multi-volume - solid bool // is a solid archive - checksum hash50 // file checksum - dec decoder // optional decoder used to unpack file - buf readBuf // temporary buffer - keyCache [cacheSize50]struct { // encryption key cache - kdfCount int - salt []byte - keys [][]byte - } -} - -// calcKeys50 calculates the keys used in RAR 5 archive processing. -// The returned slice of byte slices contains 3 keys. -// Key 0 is used for block or file decryption. -// Key 1 is optionally used for file checksum calculation. -// Key 2 is optionally used for password checking. -func calcKeys50(pass, salt []byte, kdfCount int) [][]byte { - if len(salt) > maxPbkdf2Salt { - salt = salt[:maxPbkdf2Salt] - } - keys := make([][]byte, 3) - if len(keys) == 0 { - return keys - } - - prf := hmac.New(sha256.New, pass) - prf.Write(salt) - prf.Write([]byte{0, 0, 0, 1}) - - t := prf.Sum(nil) - u := append([]byte(nil), t...) - - kdfCount-- - - for i, iter := range []int{kdfCount, 16, 16} { - for iter > 0 { - prf.Reset() - prf.Write(u) - u = prf.Sum(u[:0]) - for j := range u { - t[j] ^= u[j] - } - iter-- - } - keys[i] = append([]byte(nil), t...) - } - - pwcheck := keys[2] - for i, v := range pwcheck[pwCheckSize:] { - pwcheck[i&(pwCheckSize-1)] ^= v - } - keys[2] = pwcheck[:pwCheckSize] - - return keys -} - -// getKeys reads kdfcount and salt from b and returns the corresponding encryption keys. -func (a *archive50) getKeys(b *readBuf) (keys [][]byte, err error) { - if len(*b) < 17 { - return nil, errCorruptEncrypt - } - // read kdf count and salt - kdfCount := int(b.byte()) - if kdfCount > maxKdfCount { - return nil, errCorruptEncrypt - } - kdfCount = 1 << uint(kdfCount) - salt := b.bytes(16) - - // check cache of keys for match - for _, v := range a.keyCache { - if kdfCount == v.kdfCount && bytes.Equal(salt, v.salt) { - return v.keys, nil - } - } - // not found, calculate keys - keys = calcKeys50(a.pass, salt, kdfCount) - - // store in cache - copy(a.keyCache[1:], a.keyCache[:]) - a.keyCache[0].kdfCount = kdfCount - a.keyCache[0].salt = append([]byte(nil), salt...) - a.keyCache[0].keys = keys - - return keys, nil -} - -// checkPassword calculates if a password is correct given password check data and keys. 
-func checkPassword(b *readBuf, keys [][]byte) error { - if len(*b) < 12 { - return nil // not enough bytes, ignore for the moment - } - pwcheck := b.bytes(8) - sum := b.bytes(4) - csum := sha256.Sum256(pwcheck) - if bytes.Equal(sum, csum[:len(sum)]) && !bytes.Equal(pwcheck, keys[2]) { - return errBadPassword - } - return nil -} - -// parseFileEncryptionRecord processes the optional file encryption record from a file header. -func (a *archive50) parseFileEncryptionRecord(b readBuf, f *fileBlockHeader) error { - if ver := b.uvarint(); ver != 0 { - return errUnknownEncMethod - } - flags := b.uvarint() - - keys, err := a.getKeys(&b) - if err != nil { - return err - } - - f.key = keys[0] - if len(b) < 16 { - return errCorruptEncrypt - } - f.iv = b.bytes(16) - - if flags&file5EncCheckPresent > 0 { - if err := checkPassword(&b, keys); err != nil { - return err - } - } - if flags&file5EncUseMac > 0 { - a.checksum.key = keys[1] - } - return nil -} - -func (a *archive50) parseFileHeader(h *blockHeader50) (*fileBlockHeader, error) { - a.checksum.sum = nil - a.checksum.key = nil - - f := new(fileBlockHeader) - - f.first = h.flags&block5DataNotFirst == 0 - f.last = h.flags&block5DataNotLast == 0 - - flags := h.data.uvarint() // file flags - f.IsDir = flags&file5IsDir > 0 - f.UnKnownSize = flags&file5UnpSizeUnknown > 0 - f.UnPackedSize = int64(h.data.uvarint()) - f.PackedSize = h.dataSize - f.Attributes = int64(h.data.uvarint()) - if flags&file5HasUnixMtime > 0 { - if len(h.data) < 4 { - return nil, errCorruptFileHeader - } - f.ModificationTime = time.Unix(int64(h.data.uint32()), 0) - } - if flags&file5HasCRC32 > 0 { - if len(h.data) < 4 { - return nil, errCorruptFileHeader - } - a.checksum.sum = append([]byte(nil), h.data.bytes(4)...) - if f.first { - a.checksum.Hash = newLittleEndianCRC32() - f.cksum = &a.checksum - } - } - - flags = h.data.uvarint() // compression flags - f.solid = flags&0x0040 > 0 - f.winSize = uint(flags&0x3C00)>>10 + 17 - method := (flags >> 7) & 7 // compression method (0 == none) - if f.first && method != 0 { - unpackver := flags & 0x003f - if unpackver != 0 { - return nil, errUnknownDecoder - } - if a.dec == nil { - a.dec = new(decoder50) - } - f.decoder = a.dec - } - switch h.data.uvarint() { - case 0: - f.HostOS = HostOSWindows - case 1: - f.HostOS = HostOSUnix - default: - f.HostOS = HostOSUnknown - } - nlen := int(h.data.uvarint()) - if len(h.data) < nlen { - return nil, errCorruptFileHeader - } - f.Name = string(h.data.bytes(nlen)) - - // parse optional extra records - for _, e := range h.extra { - var err error - switch e.ftype { - case 1: // encryption - err = a.parseFileEncryptionRecord(e.data, f) - case 2: - // TODO: hash - case 3: - // TODO: time - case 4: // version - _ = e.data.uvarint() // ignore flags field - f.Version = int(e.data.uvarint()) - case 5: - // TODO: redirection - case 6: - // TODO: owner - } - if err != nil { - return nil, err - } - } - return f, nil -} - -// parseEncryptionBlock calculates the key for block encryption. 
-func (a *archive50) parseEncryptionBlock(b readBuf) error { - if ver := b.uvarint(); ver != 0 { - return errUnknownEncMethod - } - flags := b.uvarint() - keys, err := a.getKeys(&b) - if err != nil { - return err - } - if flags&enc5CheckPresent > 0 { - if err := checkPassword(&b, keys); err != nil { - return err - } - } - a.blockKey = keys[0] - return nil -} - -func (a *archive50) readBlockHeader() (*blockHeader50, error) { - r := io.Reader(a.v) - if a.blockKey != nil { - // block is encrypted - iv := a.buf[:16] - if err := readFull(r, iv); err != nil { - return nil, err - } - r = newAesDecryptReader(r, a.blockKey, iv) - } - - b := a.buf[:minHeaderSize] - if err := readFull(r, b); err != nil { - return nil, err - } - crc := b.uint32() - - hash := crc32.NewIEEE() - hash.Write(b) - - size := int(b.uvarint()) // header size - if size > cap(a.buf) { - a.buf = readBuf(make([]byte, size)) - } else { - a.buf = a.buf[:size] - } - n := copy(a.buf, b) // copy left over bytes - if err := readFull(r, a.buf[n:]); err != nil { // read rest of header - return nil, err - } - - // check header crc - hash.Write(a.buf[n:]) - if crc != hash.Sum32() { - return nil, errBadHeaderCrc - } - - b = a.buf - h := new(blockHeader50) - h.htype = b.uvarint() - h.flags = b.uvarint() - - var extraSize int - if h.flags&block5HasExtra > 0 { - extraSize = int(b.uvarint()) - } - if h.flags&block5HasData > 0 { - h.dataSize = int64(b.uvarint()) - } - if len(b) < extraSize { - return nil, errCorruptHeader - } - h.data = b.bytes(len(b) - extraSize) - - // read header extra records - for len(b) > 0 { - size = int(b.uvarint()) - if len(b) < size { - return nil, errCorruptHeader - } - data := readBuf(b.bytes(size)) - ftype := data.uvarint() - h.extra = append(h.extra, extra{ftype, data}) - } - - return h, nil -} - -// next advances to the next file block in the archive -func (a *archive50) next() (*fileBlockHeader, error) { - for { - h, err := a.readBlockHeader() - if err != nil { - return nil, err - } - a.byteReader = limitByteReader(a.v, h.dataSize) - switch h.htype { - case block5File: - return a.parseFileHeader(h) - case block5Arc: - flags := h.data.uvarint() - a.multi = flags&arc5MultiVol > 0 - a.solid = flags&arc5Solid > 0 - case block5Encrypt: - err = a.parseEncryptionBlock(h.data) - case block5End: - flags := h.data.uvarint() - if flags&endArc5NotLast == 0 || !a.multi { - return nil, errArchiveEnd - } - return nil, errArchiveContinues - default: - // discard block data - _, err = io.Copy(ioutil.Discard, a.byteReader) - } - if err != nil { - return nil, err - } - } -} - -func (a *archive50) version() int { return fileFmt50 } - -func (a *archive50) reset() { - a.blockKey = nil // reset encryption when opening new volume file -} - -func (a *archive50) isSolid() bool { - return a.solid -} - -// newArchive50 creates a new fileBlockReader for a Version 5 archive. 
-func newArchive50(r *bufio.Reader, password string) fileBlockReader { - a := new(archive50) - a.v = r - a.pass = []byte(password) - a.buf = make([]byte, 100) - return a -} diff --git a/vendor/github.com/nwaples/rardecode/bit_reader.go b/vendor/github.com/nwaples/rardecode/bit_reader.go deleted file mode 100644 index 9b284efa31..0000000000 --- a/vendor/github.com/nwaples/rardecode/bit_reader.go +++ /dev/null @@ -1,119 +0,0 @@ -package rardecode - -import "io" - -type bitReader interface { - readBits(n uint) (int, error) // read n bits of data - unreadBits(n uint) // revert the reading of the last n bits read -} - -type limitedBitReader struct { - br bitReader - n int - err error // error to return if br returns EOF before all n bits have been read -} - -// limitBitReader returns a bitReader that reads from br and stops with io.EOF after n bits. -// If br returns an io.EOF before reading n bits, err is returned. -func limitBitReader(br bitReader, n int, err error) bitReader { - return &limitedBitReader{br, n, err} -} - -func (l *limitedBitReader) readBits(n uint) (int, error) { - if int(n) > l.n { - return 0, io.EOF - } - v, err := l.br.readBits(n) - if err == nil { - l.n -= int(n) - } else if err == io.EOF { - err = l.err - } - return v, err -} - -func (l *limitedBitReader) unreadBits(n uint) { - l.n += int(n) - l.br.unreadBits(n) -} - -// rarBitReader wraps an io.ByteReader to perform various bit and byte -// reading utility functions used in RAR file processing. -type rarBitReader struct { - r io.ByteReader - v int - n uint -} - -func (r *rarBitReader) reset(br io.ByteReader) { - r.r = br - r.n = 0 - r.v = 0 -} - -func (r *rarBitReader) readBits(n uint) (int, error) { - for n > r.n { - c, err := r.r.ReadByte() - if err != nil { - return 0, err - } - r.v = r.v<<8 | int(c) - r.n += 8 - } - r.n -= n - return (r.v >> r.n) & ((1 << n) - 1), nil -} - -func (r *rarBitReader) unreadBits(n uint) { - r.n += n -} - -// alignByte aligns the current bit reading input to the next byte boundary. -func (r *rarBitReader) alignByte() { - r.n -= r.n % 8 -} - -// readUint32 reads a RAR V3 encoded uint32 -func (r *rarBitReader) readUint32() (uint32, error) { - n, err := r.readBits(2) - if err != nil { - return 0, err - } - if n != 1 { - n, err = r.readBits(4 << uint(n)) - return uint32(n), err - } - n, err = r.readBits(4) - if err != nil { - return 0, err - } - if n == 0 { - n, err = r.readBits(8) - n |= -1 << 8 - return uint32(n), err - } - nlow, err := r.readBits(4) - n = n<<4 | nlow - return uint32(n), err -} - -func (r *rarBitReader) ReadByte() (byte, error) { - n, err := r.readBits(8) - return byte(n), err -} - -// readFull reads len(p) bytes into p. If fewer bytes are read an error is returned. 
-func (r *rarBitReader) readFull(p []byte) error { - for i := range p { - c, err := r.ReadByte() - if err != nil { - return err - } - p[i] = c - } - return nil -} - -func newRarBitReader(r io.ByteReader) *rarBitReader { - return &rarBitReader{r: r} -} diff --git a/vendor/github.com/nwaples/rardecode/decode50.go b/vendor/github.com/nwaples/rardecode/decode50.go deleted file mode 100644 index 1939a444ab..0000000000 --- a/vendor/github.com/nwaples/rardecode/decode50.go +++ /dev/null @@ -1,294 +0,0 @@ -package rardecode - -import ( - "errors" - "io" -) - -const ( - mainSize5 = 306 - offsetSize5 = 64 - lowoffsetSize5 = 16 - lengthSize5 = 44 - tableSize5 = mainSize5 + offsetSize5 + lowoffsetSize5 + lengthSize5 -) - -var ( - errUnknownFilter = errors.New("rardecode: unknown V5 filter") - errCorruptDecodeHeader = errors.New("rardecode: corrupt decode header") -) - -// decoder50 implements the decoder interface for RAR 5 compression. -// Decode input it broken up into 1 or more blocks. Each block starts with -// a header containing block length and optional code length tables to initialize -// the huffman decoders with. -type decoder50 struct { - r io.ByteReader - br bitReader // bit reader for current data block - codeLength [tableSize5]byte - - lastBlock bool // current block is last block in compressed file - - mainDecoder huffmanDecoder - offsetDecoder huffmanDecoder - lowoffsetDecoder huffmanDecoder - lengthDecoder huffmanDecoder - - offset [4]int - length int -} - -func (d *decoder50) init(r io.ByteReader, reset bool) error { - d.r = r - d.lastBlock = false - - if reset { - for i := range d.offset { - d.offset[i] = 0 - } - d.length = 0 - for i := range d.codeLength { - d.codeLength[i] = 0 - } - } - err := d.readBlockHeader() - if err == io.EOF { - return errDecoderOutOfData - } - return err -} - -func (d *decoder50) readBlockHeader() error { - flags, err := d.r.ReadByte() - if err != nil { - return err - } - - bytecount := (flags>>3)&3 + 1 - if bytecount == 4 { - return errCorruptDecodeHeader - } - - hsum, err := d.r.ReadByte() - if err != nil { - return err - } - - blockBits := int(flags)&0x07 + 1 - blockBytes := 0 - sum := 0x5a ^ flags - for i := byte(0); i < bytecount; i++ { - n, err := d.r.ReadByte() - if err != nil { - return err - } - sum ^= n - blockBytes |= int(n) << (i * 8) - } - if sum != hsum { // bad header checksum - return errCorruptDecodeHeader - } - blockBits += (blockBytes - 1) * 8 - - // create bit reader for block - d.br = limitBitReader(newRarBitReader(d.r), blockBits, errDecoderOutOfData) - d.lastBlock = flags&0x40 > 0 - - if flags&0x80 > 0 { - // read new code length tables and reinitialize huffman decoders - cl := d.codeLength[:] - err = readCodeLengthTable(d.br, cl, false) - if err != nil { - return err - } - d.mainDecoder.init(cl[:mainSize5]) - cl = cl[mainSize5:] - d.offsetDecoder.init(cl[:offsetSize5]) - cl = cl[offsetSize5:] - d.lowoffsetDecoder.init(cl[:lowoffsetSize5]) - cl = cl[lowoffsetSize5:] - d.lengthDecoder.init(cl) - } - return nil -} - -func slotToLength(br bitReader, n int) (int, error) { - if n >= 8 { - bits := uint(n/4 - 1) - n = (4 | (n & 3)) << bits - if bits > 0 { - b, err := br.readBits(bits) - if err != nil { - return 0, err - } - n |= b - } - } - n += 2 - return n, nil -} - -// readFilter5Data reads an encoded integer used in V5 filters. -func readFilter5Data(br bitReader) (int, error) { - // TODO: should data really be uint? (for 32bit ints). - // It will be masked later anyway by decode window mask. 
- bytes, err := br.readBits(2) - if err != nil { - return 0, err - } - bytes++ - - var data int - for i := 0; i < bytes; i++ { - n, err := br.readBits(8) - if err != nil { - return 0, err - } - data |= n << (uint(i) * 8) - } - return data, nil -} - -func readFilter(br bitReader) (*filterBlock, error) { - fb := new(filterBlock) - var err error - - fb.offset, err = readFilter5Data(br) - if err != nil { - return nil, err - } - fb.length, err = readFilter5Data(br) - if err != nil { - return nil, err - } - ftype, err := br.readBits(3) - if err != nil { - return nil, err - } - switch ftype { - case 0: - n, err := br.readBits(5) - if err != nil { - return nil, err - } - fb.filter = func(buf []byte, offset int64) ([]byte, error) { return filterDelta(n+1, buf) } - case 1: - fb.filter = func(buf []byte, offset int64) ([]byte, error) { return filterE8(0xe8, true, buf, offset) } - case 2: - fb.filter = func(buf []byte, offset int64) ([]byte, error) { return filterE8(0xe9, true, buf, offset) } - case 3: - fb.filter = filterArm - default: - return nil, errUnknownFilter - } - return fb, nil -} - -func (d *decoder50) decodeSym(win *window, sym int) (*filterBlock, error) { - switch { - case sym < 256: - // literal - win.writeByte(byte(sym)) - return nil, nil - case sym == 256: - f, err := readFilter(d.br) - f.offset += win.buffered() - return f, err - case sym == 257: - // use previous offset and length - case sym < 262: - i := sym - 258 - offset := d.offset[i] - copy(d.offset[1:i+1], d.offset[:i]) - d.offset[0] = offset - - sl, err := d.lengthDecoder.readSym(d.br) - if err != nil { - return nil, err - } - d.length, err = slotToLength(d.br, sl) - if err != nil { - return nil, err - } - default: - length, err := slotToLength(d.br, sym-262) - if err != nil { - return nil, err - } - - offset := 1 - slot, err := d.offsetDecoder.readSym(d.br) - if err != nil { - return nil, err - } - if slot < 4 { - offset += slot - } else { - bits := uint(slot/2 - 1) - offset += (2 | (slot & 1)) << bits - - if bits >= 4 { - if bits > 4 { - n, err := d.br.readBits(bits - 4) - if err != nil { - return nil, err - } - offset += n << 4 - } - n, err := d.lowoffsetDecoder.readSym(d.br) - if err != nil { - return nil, err - } - offset += n - } else { - n, err := d.br.readBits(bits) - if err != nil { - return nil, err - } - offset += n - } - } - if offset > 0x100 { - length++ - if offset > 0x2000 { - length++ - if offset > 0x40000 { - length++ - } - } - } - copy(d.offset[1:], d.offset[:]) - d.offset[0] = offset - d.length = length - } - win.copyBytes(d.length, d.offset[0]) - return nil, nil -} - -func (d *decoder50) fill(w *window) ([]*filterBlock, error) { - var fl []*filterBlock - - for w.available() > 0 { - sym, err := d.mainDecoder.readSym(d.br) - if err == nil { - var f *filterBlock - f, err = d.decodeSym(w, sym) - if f != nil { - fl = append(fl, f) - } - } else if err == io.EOF { - // reached end of the block - if d.lastBlock { - return fl, io.EOF - } - err = d.readBlockHeader() - } - if err != nil { - if err == io.EOF { - return fl, errDecoderOutOfData - } - return fl, err - } - } - return fl, nil -} diff --git a/vendor/github.com/nwaples/rardecode/decode_reader.go b/vendor/github.com/nwaples/rardecode/decode_reader.go deleted file mode 100644 index 36699f9aa5..0000000000 --- a/vendor/github.com/nwaples/rardecode/decode_reader.go +++ /dev/null @@ -1,290 +0,0 @@ -package rardecode - -import ( - "errors" - "io" -) - -const ( - minWindowSize = 0x40000 - maxQueuedFilters = 8192 -) - -var ( - 
errTooManyFilters = errors.New("rardecode: too many filters") - errInvalidFilter = errors.New("rardecode: invalid filter") -) - -// filter functions take a byte slice, the current output offset and -// returns transformed data. -type filter func(b []byte, offset int64) ([]byte, error) - -// filterBlock is a block of data to be processed by a filter. -type filterBlock struct { - length int // length of block - offset int // bytes to be read before start of block - reset bool // drop all existing queued filters - filter filter // filter function -} - -// decoder is the interface for decoding compressed data -type decoder interface { - init(r io.ByteReader, reset bool) error // initialize decoder for current file - fill(w *window) ([]*filterBlock, error) // fill window with decoded data, returning any filters -} - -// window is a sliding window buffer. -type window struct { - buf []byte - mask int // buf length mask - r int // index in buf for reads (beginning) - w int // index in buf for writes (end) - l int // length of bytes to be processed by copyBytes - o int // offset of bytes to be processed by copyBytes -} - -// buffered returns the number of bytes yet to be read from window -func (w *window) buffered() int { return (w.w - w.r) & w.mask } - -// available returns the number of bytes that can be written before the window is full -func (w *window) available() int { return (w.r - w.w - 1) & w.mask } - -func (w *window) reset(log2size uint, clear bool) { - size := 1 << log2size - if size < minWindowSize { - size = minWindowSize - } - if size > len(w.buf) { - b := make([]byte, size) - if clear { - w.w = 0 - } else if len(w.buf) > 0 { - n := copy(b, w.buf[w.w:]) - n += copy(b[n:], w.buf[:w.w]) - w.w = n - } - w.buf = b - w.mask = size - 1 - } else if clear { - for i := range w.buf { - w.buf[i] = 0 - } - w.w = 0 - } - w.r = w.w -} - -// writeByte writes c to the end of the window -func (w *window) writeByte(c byte) { - w.buf[w.w] = c - w.w = (w.w + 1) & w.mask -} - -// copyBytes copies len bytes at off distance from the end -// to the end of the window. -func (w *window) copyBytes(len, off int) { - len &= w.mask - - n := w.available() - if len > n { - // if there is not enough space availaible we copy - // as much as we can and save the offset and length - // of the remaining data to be copied later. - w.l = len - n - w.o = off - len = n - } - - i := (w.w - off) & w.mask - for ; len > 0; len-- { - w.buf[w.w] = w.buf[i] - w.w = (w.w + 1) & w.mask - i = (i + 1) & w.mask - } -} - -// read reads bytes from the beginning of the window into p -func (w *window) read(p []byte) (n int) { - if w.r > w.w { - n = copy(p, w.buf[w.r:]) - w.r = (w.r + n) & w.mask - p = p[n:] - } - if w.r < w.w { - l := copy(p, w.buf[w.r:w.w]) - w.r += l - n += l - } - if w.l > 0 && n > 0 { - // if we have successfully read data, copy any - // leftover data from a previous copyBytes. - l := w.l - w.l = 0 - w.copyBytes(l, w.o) - } - return n -} - -// decodeReader implements io.Reader for decoding compressed data in RAR archives. 
-type decodeReader struct { - win window // sliding window buffer used as decode dictionary - dec decoder // decoder being used to unpack file - tot int64 // total bytes read - buf []byte // filter input/output buffer - outbuf []byte // filter output not yet read - err error - filters []*filterBlock // list of filterBlock's, each with offset relative to previous in list -} - -func (d *decodeReader) init(r io.ByteReader, dec decoder, winsize uint, reset bool) error { - if reset { - d.filters = nil - } - d.err = nil - d.outbuf = nil - d.tot = 0 - d.win.reset(winsize, reset) - d.dec = dec - return d.dec.init(r, reset) -} - -func (d *decodeReader) readErr() error { - err := d.err - d.err = nil - return err -} - -// queueFilter adds a filterBlock to the end decodeReader's filters. -func (d *decodeReader) queueFilter(f *filterBlock) error { - if f.reset { - d.filters = nil - } - if len(d.filters) >= maxQueuedFilters { - return errTooManyFilters - } - // make offset relative to previous filter in list - for _, fb := range d.filters { - if f.offset < fb.offset { - // filter block must not start before previous filter - return errInvalidFilter - } - f.offset -= fb.offset - } - // offset & length must be < window size - f.offset &= d.win.mask - f.length &= d.win.mask - d.filters = append(d.filters, f) - return nil -} - -// processFilters processes any filters valid at the current read index -// and stores the output in outbuf. -func (d *decodeReader) processFilters() (err error) { - f := d.filters[0] - if f.offset > 0 { - return nil - } - d.filters = d.filters[1:] - if d.win.buffered() < f.length { - // fill() didn't return enough bytes - err = d.readErr() - if err == nil || err == io.EOF { - return errInvalidFilter - } - return err - } - - if cap(d.buf) < f.length { - d.buf = make([]byte, f.length) - } - d.outbuf = d.buf[:f.length] - n := d.win.read(d.outbuf) - for { - // run filter passing buffer and total bytes read so far - d.outbuf, err = f.filter(d.outbuf, d.tot) - if err != nil { - return err - } - if cap(d.outbuf) > cap(d.buf) { - // Filter returned a bigger buffer, save it for future filters. - d.buf = d.outbuf - } - if len(d.filters) == 0 { - return nil - } - f = d.filters[0] - - if f.offset != 0 { - // next filter not at current offset - f.offset -= n - return nil - } - if f.length != len(d.outbuf) { - return errInvalidFilter - } - d.filters = d.filters[1:] - - if cap(d.outbuf) < cap(d.buf) { - // Filter returned a smaller buffer. Copy it back to the saved buffer - // so the next filter can make use of the larger buffer if needed. - d.outbuf = append(d.buf[:0], d.outbuf...) - } - } -} - -// fill fills the decodeReader's window -func (d *decodeReader) fill() { - if d.err != nil { - return - } - var fl []*filterBlock - fl, d.err = d.dec.fill(&d.win) // fill window using decoder - for _, f := range fl { - err := d.queueFilter(f) - if err != nil { - d.err = err - return - } - } -} - -// Read decodes data and stores it in p. 
-func (d *decodeReader) Read(p []byte) (n int, err error) { - if len(d.outbuf) == 0 { - // no filter output, see if we need to create more - if d.win.buffered() == 0 { - // fill empty window - d.fill() - if d.win.buffered() == 0 { - return 0, d.readErr() - } - } else if len(d.filters) > 0 { - f := d.filters[0] - if f.offset == 0 && f.length > d.win.buffered() { - d.fill() // filter at current offset needs more data - } - } - if len(d.filters) > 0 { - if err := d.processFilters(); err != nil { - return 0, err - } - } - } - if len(d.outbuf) > 0 { - // copy filter output into p - n = copy(p, d.outbuf) - d.outbuf = d.outbuf[n:] - } else if len(d.filters) > 0 { - f := d.filters[0] - if f.offset < len(p) { - // only read data up to beginning of next filter - p = p[:f.offset] - } - n = d.win.read(p) // read directly from window - f.offset -= n // adjust first filter offset by bytes just read - } else { - n = d.win.read(p) // read directly from window - } - d.tot += int64(n) - return n, nil -} diff --git a/vendor/github.com/nwaples/rardecode/decrypt_reader.go b/vendor/github.com/nwaples/rardecode/decrypt_reader.go deleted file mode 100644 index bb9f279c43..0000000000 --- a/vendor/github.com/nwaples/rardecode/decrypt_reader.go +++ /dev/null @@ -1,126 +0,0 @@ -package rardecode - -import ( - "crypto/aes" - "crypto/cipher" - "io" -) - -// cipherBlockReader implements Block Mode decryption of an io.Reader object. -type cipherBlockReader struct { - r io.Reader - mode cipher.BlockMode - inbuf []byte // input buffer for partial data block - outbuf []byte // output buffer used when output slice < block size - n int // bytes read from outbuf - err error -} - -// read reads and decrypts one or more input blocks into p. -// len(p) must be >= cipher block size. -func (cr *cipherBlockReader) read(p []byte) (n int, err error) { - bs := cr.mode.BlockSize() - // round p down to a multiple of the block size - l := len(p) - len(p)%bs - p = p[:l] - - l = len(cr.inbuf) - if l > 0 { - // copy any buffered input into p - copy(p, cr.inbuf) - cr.inbuf = cr.inbuf[:0] - } - // read data for at least one block - n, err = io.ReadAtLeast(cr.r, p[l:], bs-l) - n += l - p = p[:n] - - l = n % bs - // check if p is a multiple of the cipher block size - if l > 0 { - n -= l - // save trailing partial block to process later - cr.inbuf = append(cr.inbuf, p[n:]...) - p = p[:n] - } - - if err != nil { - if err == io.ErrUnexpectedEOF || err == io.ErrShortBuffer { - // ignore trailing bytes < block size length - err = io.EOF - } - return 0, err - } - cr.mode.CryptBlocks(p, p) // decrypt block(s) - return n, nil -} - -// Read reads and decrypts data into p. -// If the input is not a multiple of the cipher block size, -// the trailing bytes will be ignored. -func (cr *cipherBlockReader) Read(p []byte) (n int, err error) { - for { - if cr.n < len(cr.outbuf) { - // return buffered output - n = copy(p, cr.outbuf[cr.n:]) - cr.n += n - return n, nil - } - if cr.err != nil { - err = cr.err - cr.err = nil - return 0, err - } - if len(p) >= cap(cr.outbuf) { - break - } - // p is not large enough to process a block, use outbuf instead - n, cr.err = cr.read(cr.outbuf[:cap(cr.outbuf)]) - cr.outbuf = cr.outbuf[:n] - cr.n = 0 - } - // read blocks into p - return cr.read(p) -} - -// ReadByte returns the next decrypted byte. 
-func (cr *cipherBlockReader) ReadByte() (byte, error) { - for { - if cr.n < len(cr.outbuf) { - c := cr.outbuf[cr.n] - cr.n++ - return c, nil - } - if cr.err != nil { - err := cr.err - cr.err = nil - return 0, err - } - // refill outbuf - var n int - n, cr.err = cr.read(cr.outbuf[:cap(cr.outbuf)]) - cr.outbuf = cr.outbuf[:n] - cr.n = 0 - } -} - -// newCipherBlockReader returns a cipherBlockReader that decrypts the given io.Reader using -// the provided block mode cipher. -func newCipherBlockReader(r io.Reader, mode cipher.BlockMode) *cipherBlockReader { - cr := &cipherBlockReader{r: r, mode: mode} - cr.outbuf = make([]byte, 0, mode.BlockSize()) - cr.inbuf = make([]byte, 0, mode.BlockSize()) - return cr -} - -// newAesDecryptReader returns a cipherBlockReader that decrypts input from a given io.Reader using AES. -// It will panic if the provided key is invalid. -func newAesDecryptReader(r io.Reader, key, iv []byte) *cipherBlockReader { - block, err := aes.NewCipher(key) - if err != nil { - panic(err) - } - mode := cipher.NewCBCDecrypter(block, iv) - - return newCipherBlockReader(r, mode) -} diff --git a/vendor/github.com/nwaples/rardecode/reader.go b/vendor/github.com/nwaples/rardecode/reader.go deleted file mode 100644 index 11adc4fea7..0000000000 --- a/vendor/github.com/nwaples/rardecode/reader.go +++ /dev/null @@ -1,376 +0,0 @@ -package rardecode - -import ( - "bufio" - "bytes" - "errors" - "io" - "io/ioutil" - "os" - "time" -) - -// FileHeader HostOS types -const ( - HostOSUnknown = 0 - HostOSMSDOS = 1 - HostOSOS2 = 2 - HostOSWindows = 3 - HostOSUnix = 4 - HostOSMacOS = 5 - HostOSBeOS = 6 -) - -const ( - maxPassword = 128 -) - -var ( - errShortFile = errors.New("rardecode: decoded file too short") - errInvalidFileBlock = errors.New("rardecode: invalid file block") - errUnexpectedArcEnd = errors.New("rardecode: unexpected end of archive") - errBadFileChecksum = errors.New("rardecode: bad file checksum") -) - -type byteReader interface { - io.Reader - io.ByteReader -} - -type limitedReader struct { - r io.Reader - n int64 // bytes remaining - shortErr error // error returned when r returns io.EOF with n > 0 -} - -func (l *limitedReader) Read(p []byte) (int, error) { - if l.n <= 0 { - return 0, io.EOF - } - if int64(len(p)) > l.n { - p = p[0:l.n] - } - n, err := l.r.Read(p) - l.n -= int64(n) - if err == io.EOF && l.n > 0 { - return n, l.shortErr - } - return n, err -} - -type limitedByteReader struct { - limitedReader - br io.ByteReader -} - -func (l *limitedByteReader) ReadByte() (byte, error) { - if l.n <= 0 { - return 0, io.EOF - } - c, err := l.br.ReadByte() - if err == nil { - l.n-- - } else if err == io.EOF && l.n > 0 { - return 0, l.shortErr - } - return c, err -} - -// limitByteReader returns a limitedByteReader that reads from r and stops with -// io.EOF after n bytes. -// If r returns an io.EOF before reading n bytes, io.ErrUnexpectedEOF is returned. -func limitByteReader(r byteReader, n int64) *limitedByteReader { - return &limitedByteReader{limitedReader{r, n, io.ErrUnexpectedEOF}, r} -} - -// fileChecksum allows file checksum validations to be performed. -// File contents must first be written to fileChecksum. Then valid is -// called to perform the file checksum calculation to determine -// if the file contents are valid or not. -type fileChecksum interface { - io.Writer - valid() bool -} - -// FileHeader represents a single file in a RAR archive. 
-type FileHeader struct { - Name string // file name using '/' as the directory separator - IsDir bool // is a directory - HostOS byte // Host OS the archive was created on - Attributes int64 // Host OS specific file attributes - PackedSize int64 // packed file size (or first block if the file spans volumes) - UnPackedSize int64 // unpacked file size - UnKnownSize bool // unpacked file size is not known - ModificationTime time.Time // modification time (non-zero if set) - CreationTime time.Time // creation time (non-zero if set) - AccessTime time.Time // access time (non-zero if set) - Version int // file version -} - -// Mode returns an os.FileMode for the file, calculated from the Attributes field. -func (f *FileHeader) Mode() os.FileMode { - var m os.FileMode - - if f.IsDir { - m = os.ModeDir - } - if f.HostOS == HostOSWindows { - if f.IsDir { - m |= 0777 - } else if f.Attributes&1 > 0 { - m |= 0444 // readonly - } else { - m |= 0666 - } - return m - } - // assume unix perms for all remaining os types - m |= os.FileMode(f.Attributes) & os.ModePerm - - // only check other bits on unix host created archives - if f.HostOS != HostOSUnix { - return m - } - - if f.Attributes&0x200 != 0 { - m |= os.ModeSticky - } - if f.Attributes&0x400 != 0 { - m |= os.ModeSetgid - } - if f.Attributes&0x800 != 0 { - m |= os.ModeSetuid - } - - // Check for additional file types. - if f.Attributes&0xF000 == 0xA000 { - m |= os.ModeSymlink - } - return m -} - -// fileBlockHeader represents a file block in a RAR archive. -// Files may comprise one or more file blocks. -// Solid files retain decode tables and dictionary from previous solid files in the archive. -type fileBlockHeader struct { - first bool // first block in file - last bool // last block in file - solid bool // file is solid - winSize uint // log base 2 of decode window size - cksum fileChecksum // file checksum - decoder decoder // decoder to use for file - key []byte // key for AES, non-empty if file encrypted - iv []byte // iv for AES, non-empty if file encrypted - FileHeader -} - -// fileBlockReader provides sequential access to file blocks in a RAR archive. -type fileBlockReader interface { - io.Reader // Read's read data from the current file block - io.ByteReader // Read bytes from current file block - next() (*fileBlockHeader, error) // reads the next file block header at current position - reset() // resets encryption - isSolid() bool // is archive solid - version() int // returns current archive format version -} - -// packedFileReader provides sequential access to packed files in a RAR archive. -type packedFileReader struct { - r fileBlockReader - h *fileBlockHeader // current file header -} - -// nextBlockInFile reads the next file block in the current file at the current -// archive file position, or returns an error if there is a problem. -// It is invalid to call this when already at the last block in the current file. -func (f *packedFileReader) nextBlockInFile() error { - h, err := f.r.next() - if err != nil { - if err == io.EOF { - // archive ended, but file hasn't - return errUnexpectedArcEnd - } - return err - } - if h.first || h.Name != f.h.Name { - return errInvalidFileBlock - } - f.h = h - return nil -} - -// next advances to the next packed file in the RAR archive. 
-func (f *packedFileReader) next() (*fileBlockHeader, error) { - if f.h != nil { - // skip to last block in current file - for !f.h.last { - // discard remaining block data - if _, err := io.Copy(ioutil.Discard, f.r); err != nil { - return nil, err - } - if err := f.nextBlockInFile(); err != nil { - return nil, err - } - } - // discard last block data - if _, err := io.Copy(ioutil.Discard, f.r); err != nil { - return nil, err - } - } - var err error - f.h, err = f.r.next() // get next file block - if err != nil { - if err == errArchiveEnd { - return nil, io.EOF - } - return nil, err - } - if !f.h.first { - return nil, errInvalidFileBlock - } - return f.h, nil -} - -// Read reads the packed data for the current file into p. -func (f *packedFileReader) Read(p []byte) (int, error) { - n, err := f.r.Read(p) // read current block data - for err == io.EOF { // current block empty - if n > 0 { - return n, nil - } - if f.h == nil || f.h.last { - return 0, io.EOF // last block so end of file - } - if err := f.nextBlockInFile(); err != nil { - return 0, err - } - n, err = f.r.Read(p) // read new block data - } - return n, err -} - -func (f *packedFileReader) ReadByte() (byte, error) { - c, err := f.r.ReadByte() // read current block data - for err == io.EOF && f.h != nil && !f.h.last { // current block empty - if err := f.nextBlockInFile(); err != nil { - return 0, err - } - c, err = f.r.ReadByte() // read new block data - } - return c, err -} - -// Reader provides sequential access to files in a RAR archive. -type Reader struct { - r io.Reader // reader for current unpacked file - pr packedFileReader // reader for current packed file - dr decodeReader // reader for decoding and filters if file is compressed - cksum fileChecksum // current file checksum - solidr io.Reader // reader for solid file -} - -// Read reads from the current file in the RAR archive. -func (r *Reader) Read(p []byte) (int, error) { - n, err := r.r.Read(p) - if err == io.EOF && r.cksum != nil && !r.cksum.valid() { - return n, errBadFileChecksum - } - return n, err -} - -// Next advances to the next file in the archive. -func (r *Reader) Next() (*FileHeader, error) { - if r.solidr != nil { - // solid files must be read fully to update decoder information - if _, err := io.Copy(ioutil.Discard, r.solidr); err != nil { - return nil, err - } - } - - h, err := r.pr.next() // skip to next file - if err != nil { - return nil, err - } - r.solidr = nil - - br := byteReader(&r.pr) // start with packed file reader - - // check for encryption - if len(h.key) > 0 && len(h.iv) > 0 { - br = newAesDecryptReader(br, h.key, h.iv) // decrypt - } - r.r = br - // check for compression - if h.decoder != nil { - err = r.dr.init(br, h.decoder, h.winSize, !h.solid) - if err != nil { - return nil, err - } - r.r = &r.dr - if r.pr.r.isSolid() { - r.solidr = r.r - } - } - if h.UnPackedSize >= 0 && !h.UnKnownSize { - // Limit reading to UnPackedSize as there may be padding - r.r = &limitedReader{r.r, h.UnPackedSize, errShortFile} - } - r.cksum = h.cksum - if r.cksum != nil { - r.r = io.TeeReader(r.r, h.cksum) // write file data to checksum as it is read - } - fh := new(FileHeader) - *fh = h.FileHeader - return fh, nil -} - -func (r *Reader) init(fbr fileBlockReader) { - r.r = bytes.NewReader(nil) // initial reads will always return EOF - r.pr.r = fbr -} - -// NewReader creates a Reader reading from r. -// NewReader only supports single volume archives. -// Multi-volume archives must use OpenReader. 
-func NewReader(r io.Reader, password string) (*Reader, error) { - br, ok := r.(*bufio.Reader) - if !ok { - br = bufio.NewReader(r) - } - fbr, err := newFileBlockReader(br, password) - if err != nil { - return nil, err - } - rr := new(Reader) - rr.init(fbr) - return rr, nil -} - -type ReadCloser struct { - v *volume - Reader -} - -// Close closes the rar file. -func (rc *ReadCloser) Close() error { - return rc.v.Close() -} - -// Volumes returns the volume filenames that have been used in decoding the archive -// up to this point. This will include the current open volume if the archive is still -// being processed. -func (rc *ReadCloser) Volumes() []string { - return rc.v.files -} - -// OpenReader opens a RAR archive specified by the name and returns a ReadCloser. -func OpenReader(name, password string) (*ReadCloser, error) { - v, err := openVolume(name, password) - if err != nil { - return nil, err - } - rc := new(ReadCloser) - rc.v = v - rc.Reader.init(v) - return rc, nil -} diff --git a/vendor/github.com/nwaples/rardecode/LICENSE b/vendor/github.com/nwaples/rardecode/v2/LICENSE similarity index 100% rename from vendor/github.com/nwaples/rardecode/LICENSE rename to vendor/github.com/nwaples/rardecode/v2/LICENSE diff --git a/vendor/github.com/nwaples/rardecode/README.md b/vendor/github.com/nwaples/rardecode/v2/README.md similarity index 53% rename from vendor/github.com/nwaples/rardecode/README.md rename to vendor/github.com/nwaples/rardecode/v2/README.md index 513464c251..719e47c519 100644 --- a/vendor/github.com/nwaples/rardecode/README.md +++ b/vendor/github.com/nwaples/rardecode/v2/README.md @@ -1,4 +1,5 @@ # rardecode [![GoDoc](https://godoc.org/github.com/nwaples/rardecode?status.svg)](https://godoc.org/github.com/nwaples/rardecode) +[![Go Report Card](https://goreportcard.com/badge/github.com/nwaples/rardecode/v2)](https://goreportcard.com/report/github.com/nwaples/rardecode/v2) A go package for reading RAR archives. 
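For context on what this bump replaces: the v1 reader API deleted above was driven through NewReader/Next/Read. A minimal consumer sketch, using only the v1 signatures visible in the deleted reader.go (NewReader, Next, FileHeader.IsDir, Read) — illustrative only, not part of this diff; the archive filename is hypothetical:

package main

import (
	"io"
	"log"
	"os"

	"github.com/nwaples/rardecode" // old v1 import path removed by this change
)

func main() {
	f, err := os.Open("archive.rar") // hypothetical input file
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// NewReader only supports single volume archives; multi-volume
	// archives needed OpenReader (see the deleted reader.go above).
	rr, err := rardecode.NewReader(f, "") // empty password: unencrypted archive
	if err != nil {
		log.Fatal(err)
	}
	for {
		hdr, err := rr.Next() // advance to the next file in the archive
		if err == io.EOF {
			break // no more files
		}
		if err != nil {
			log.Fatal(err)
		}
		if hdr.IsDir {
			continue
		}
		// Reading rr streams the unpacked contents of the current file;
		// a real consumer would write them out instead of discarding.
		if _, err := io.Copy(io.Discard, rr); err != nil {
			log.Fatal(err)
		}
	}
}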
diff --git a/vendor/github.com/nwaples/rardecode/v2/archive.go b/vendor/github.com/nwaples/rardecode/v2/archive.go
new file mode 100644
index 0000000000..3984dd2a4c
--- /dev/null
+++ b/vendor/github.com/nwaples/rardecode/v2/archive.go
@@ -0,0 +1,130 @@
+package rardecode
+
+import (
+	"errors"
+	"hash"
+)
+
+const (
+	_ = iota
+	decode20Ver
+	decode29Ver
+	decode50Ver
+	decode70Ver
+)
+
+var (
+	ErrCorruptBlockHeader    = errors.New("rardecode: corrupt block header")
+	ErrCorruptFileHeader     = errors.New("rardecode: corrupt file header")
+	ErrBadHeaderCRC          = errors.New("rardecode: bad header crc")
+	ErrUnknownDecoder        = errors.New("rardecode: unknown decoder version")
+	ErrDecoderOutOfData      = errors.New("rardecode: decoder expected more data than is in packed file")
+	ErrArchiveEncrypted      = errors.New("rardecode: archive encrypted, password required")
+	ErrArchivedFileEncrypted = errors.New("rardecode: archived files encrypted, password required")
+)
+
+type readBuf []byte
+
+func (b *readBuf) byte() byte {
+	v := (*b)[0]
+	*b = (*b)[1:]
+	return v
+}
+
+func (b *readBuf) uint16() uint16 {
+	v := uint16((*b)[0]) | uint16((*b)[1])<<8
+	*b = (*b)[2:]
+	return v
+}
+
+func (b *readBuf) uint32() uint32 {
+	v := uint32((*b)[0]) | uint32((*b)[1])<<8 | uint32((*b)[2])<<16 | uint32((*b)[3])<<24
+	*b = (*b)[4:]
+	return v
+}
+
+func (b *readBuf) uint64() uint64 {
+	v := uint64((*b)[0]) | uint64((*b)[1])<<8 | uint64((*b)[2])<<16 | uint64((*b)[3])<<24 |
+		uint64((*b)[4])<<32 | uint64((*b)[5])<<40 | uint64((*b)[6])<<48 | uint64((*b)[7])<<56
+	*b = (*b)[8:]
+	return v
+}
+
+func (b *readBuf) bytes(n int) []byte {
+	v := (*b)[:n]
+	*b = (*b)[n:]
+	return v
+}
+
+func (b *readBuf) uvarint() uint64 {
+	var x uint64
+	var s uint
+	for i, n := range *b {
+		if n < 0x80 {
+			*b = (*b)[i+1:]
+			return x | uint64(n)<<s
+		}
+		x |= uint64(n&0x7f) << s
+		s += 7
+	}
+	*b = nil
+	return x
+}
+
+// fileBlockHeader represents a file block in a RAR archive.
+// Files may comprise one or more file blocks.
+type fileBlockHeader struct {
+	first    bool             // first block in file
+	last     bool             // last block in file
+	arcSolid bool             // archive is solid
+	winSize  int              // decode window size
+	hash     func() hash.Hash // hash used for file checksum
+	hashKey  []byte           // optional hmac key used in file checksum
+	sum      []byte           // expected checksum for file contents
+	decVer   int              // decoder version to use for file
+	key      []byte           // key for AES, non-empty if file encrypted
+	iv       []byte           // iv for AES, non-empty if file encrypted
+	genKeys  func() error     // generates key and iv fields
+	FileHeader
+}
+
+// fileBlockReader provides sequential access to file blocks in a RAR archive.
+type fileBlockReader interface {
+	next(v *volume) (*fileBlockHeader, error) // reads the next file block header at current position
+	clone() fileBlockReader                   // makes a copy of the fileBlockReader
+}
+
+// newFileBlockReader returns a fileBlockReader for the format version of the
+// archive volume v, truncating passwords longer than maxPassword runes.
+func newFileBlockReader(v *volume, pass *string) (fileBlockReader, error) {
+	if pass != nil {
+		runes := []rune(*pass)
+		if len(runes) > maxPassword {
+			pw := string(runes[:maxPassword])
+			pass = &pw
+		}
+	}
+	switch v.ver {
+	case 0:
+		return newArchive15(pass), nil
+	case 1:
+		return newArchive50(pass), nil
+	default:
+		return nil, ErrUnknownVersion
+	}
+}
diff --git a/vendor/github.com/nwaples/rardecode/archive15.go b/vendor/github.com/nwaples/rardecode/v2/archive15.go
similarity index 70%
rename from vendor/github.com/nwaples/rardecode/archive15.go
rename to vendor/github.com/nwaples/rardecode/v2/archive15.go
index 260176c06b..c1ebc24c28 100644
--- a/vendor/github.com/nwaples/rardecode/archive15.go
+++ b/vendor/github.com/nwaples/rardecode/v2/archive15.go
@@ -1,14 +1,12 @@
 package rardecode
 
 import (
-	"bufio"
 	"bytes"
 	"crypto/sha1"
 	"errors"
-	"hash"
 	"hash/crc32"
 	"io"
-	"io/ioutil"
+	"os"
 	"strconv"
 	"strings"
 	"time"
@@ -19,6 +17,7 @@ const (
 	// block types
 	blockArc     = 0x73
 	blockFile    = 0x74
+	blockComment = 0x75
 	blockService = 0x7a
 	blockEnd     = 0x7b
 
@@ -27,6 +26,7 @@
 	// archive block flags
 	arcVolume    = 0x0001
+	arcComment   = 0x0002
 	arcSolid     = 0x0008
 	arcNewNaming = 0x0010
 	arcEncrypted = 0x0080
@@ -52,7 +52,7 @@ const (
 )
 
 var (
-	errMultipleDecoders = errors.New("rardecode: multiple decoders in a single archive not supported")
+	ErrUnsupportedDecoder = errors.New("rardecode: unsupported decoder version")
 )
 
 type blockHeader15 struct {
@@ -62,36 +62,25 @@ type blockHeader15 struct {
 	dataSize int64 // size of extra block data
 }
 
-// fileHash32 implements fileChecksum for 32-bit hashes
-type fileHash32 struct {
-	hash.Hash32        // hash to write file contents to
-	sum         uint32 // 32bit checksum for file
-}
-
-func (h *fileHash32) valid() bool {
-	return h.sum == h.Sum32()
-} - // archive15 implements fileBlockReader for RAR 1.5 file format archives type archive15 struct { - byteReader // reader for current block data - v *bufio.Reader // reader for current archive volume - dec decoder // current decoder - decVer byte // current decoder version - multi bool // archive is multi-volume - old bool // archive uses old naming scheme - solid bool // archive is a solid archive - encrypted bool - pass []uint16 // password in UTF-16 - checksum fileHash32 // file checksum - buf readBuf // temporary buffer - keyCache [cacheSize30]struct { // cache of previously calculated decryption keys + multi bool // archive is multi-volume + solid bool // archive is a solid archive + encrypted bool + pass []uint16 // password in UTF-16 + keyCache [cacheSize30]struct { // cache of previously calculated decryption keys salt []byte key []byte iv []byte } } +func (a *archive15) clone() fileBlockReader { + na := new(archive15) + *na = *a + return na +} + // Calculates the key and iv for AES decryption given a password and salt. func calcAes30Params(pass []uint16, salt []byte) (key, iv []byte) { p := make([]byte, 0, len(pass)*2+len(salt)) @@ -102,10 +91,13 @@ func calcAes30Params(pass []uint16, salt []byte) (key, iv []byte) { hash := sha1.New() iv = make([]byte, 16) - s := make([]byte, 0, hash.Size()) + s := make([]byte, hash.Size()) + b := s[:3] for i := 0; i < hashRounds; i++ { - hash.Write(p) - hash.Write([]byte{byte(i), byte(i >> 8), byte(i >> 16)}) + // ignore hash Write errors, should always succeed + _, _ = hash.Write(p) + b[0], b[1], b[2] = byte(i), byte(i>>8), byte(i>>16) + _, _ = hash.Write(b) if i%(hashRounds/16) == 0 { s = hash.Sum(s[:0]) iv[i/(hashRounds/16)] = s[4*4+3] @@ -256,15 +248,18 @@ func (a *archive15) parseFileHeader(h *blockHeader15) (*fileBlockHeader, error) f.first = h.flags&fileSplitBefore == 0 f.last = h.flags&fileSplitAfter == 0 - f.solid = h.flags&fileSolid > 0 + f.Solid = h.flags&fileSolid > 0 + f.arcSolid = a.solid + f.Encrypted = h.flags&fileEncrypted > 0 + f.HeaderEncrypted = a.encrypted f.IsDir = h.flags&fileWindowMask == fileWindowMask if !f.IsDir { - f.winSize = uint(h.flags&fileWindowMask)>>5 + 16 + f.winSize = 0x10000 << ((h.flags & fileWindowMask) >> 5) } b := h.data if len(b) < 21 { - return nil, errCorruptFileHeader + return nil, ErrCorruptFileHeader } f.PackedSize = h.dataSize @@ -273,7 +268,7 @@ func (a *archive15) parseFileHeader(h *blockHeader15) (*fileBlockHeader, error) if f.HostOS > HostOSBeOS { f.HostOS = HostOSUnknown } - a.checksum.sum = b.uint32() + f.sum = append([]byte(nil), b.bytes(4)...) 
f.ModificationTime = parseDosTime(b.uint32()) unpackver := b.byte() // decoder version @@ -282,7 +277,7 @@ func (a *archive15) parseFileHeader(h *blockHeader15) (*fileBlockHeader, error) f.Attributes = int64(b.uint32()) if h.flags&fileLargeData > 0 { if len(b) < 8 { - return nil, errCorruptFileHeader + return nil, ErrCorruptFileHeader } _ = b.uint32() // already read large PackedSize in readBlockHeader f.UnPackedSize |= int64(b.uint32()) << 32 @@ -292,7 +287,7 @@ func (a *archive15) parseFileHeader(h *blockHeader15) (*fileBlockHeader, error) f.UnPackedSize = -1 } if len(b) < namesize { - return nil, errCorruptFileHeader + return nil, ErrCorruptFileHeader } name := b.bytes(namesize) if h.flags&fileUnicode == 0 { @@ -318,9 +313,9 @@ func (a *archive15) parseFileHeader(h *blockHeader15) (*fileBlockHeader, error) var salt []byte if h.flags&fileSalt > 0 { if len(b) < saltSize { - return nil, errCorruptFileHeader + return nil, ErrCorruptFileHeader } - salt = b.bytes(saltSize) + salt = append([]byte(nil), b.bytes(saltSize)...) } if h.flags&fileExtTime > 0 { readExtTimes(f, &b) @@ -331,83 +326,97 @@ func (a *archive15) parseFileHeader(h *blockHeader15) (*fileBlockHeader, error) } // fields only needed for first block in a file if h.flags&fileEncrypted > 0 && len(salt) == saltSize { - f.key, f.iv = a.getKeys(salt) - } - a.checksum.Reset() - f.cksum = &a.checksum - if method == 0 { - return f, nil + f.genKeys = func() error { + if a.pass == nil { + return ErrArchivedFileEncrypted + } + f.key, f.iv = a.getKeys(salt) + return nil + } } - if a.dec == nil { + f.hash = newLittleEndianCRC32 + if method != 0 { switch unpackver { - case 15, 20, 26: - return nil, errUnsupportedDecoder + case 15: + return nil, ErrUnsupportedDecoder + case 20, 26: + f.decVer = decode20Ver case 29: - a.dec = new(decoder29) + f.decVer = decode29Ver default: - return nil, errUnknownDecoder + return nil, ErrUnknownDecoder } - a.decVer = unpackver - } else if a.decVer != unpackver { - return nil, errMultipleDecoders } - f.decoder = a.dec return f, nil } // readBlockHeader returns the next block header in the archive. // It will return io.EOF if there were no bytes read. 
-func (a *archive15) readBlockHeader() (*blockHeader15, error) { - var err error - b := a.buf[:7] - r := io.Reader(a.v) +func (a *archive15) readBlockHeader(r sliceReader) (*blockHeader15, error) { if a.encrypted { - salt := a.buf[:saltSize] - _, err = io.ReadFull(r, salt) + if a.pass == nil { + return nil, ErrArchiveEncrypted + } + salt, err := r.readSlice(saltSize) if err != nil { return nil, err } key, iv := a.getKeys(salt) - r = newAesDecryptReader(r, key, iv) - err = readFull(r, b) - } else { - _, err = io.ReadFull(r, b) + r = newAesSliceReader(r, key, iv) } + var b readBuf + var err error + // peek to find the header size + b, err = r.peek(7) if err != nil { + if err == io.EOF && a.encrypted { + err = io.ErrUnexpectedEOF + } return nil, err } - crc := b.uint16() - hash := crc32.NewIEEE() - hash.Write(b) h := new(blockHeader15) h.htype = b.byte() h.flags = b.uint16() - size := b.uint16() - if size < 7 { - return nil, errCorruptHeader - } - size -= 7 - if int(size) > cap(a.buf) { - a.buf = readBuf(make([]byte, size)) + size := int(b.uint16()) + if h.htype == blockArc && h.flags&arcComment > 0 { + // comment block embedded into archive block + if size < 13 { + return nil, ErrCorruptBlockHeader + } + size = 13 + } else if size < 7 { + return nil, ErrCorruptBlockHeader } - h.data = a.buf[:size] - if err := readFull(r, h.data); err != nil { + h.data, err = r.readSlice(size) + if err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } return nil, err } - hash.Write(h.data) + hash := crc32.NewIEEE() + if h.htype == blockComment { + if size < 13 { + return nil, ErrCorruptBlockHeader + } + _, _ = hash.Write(h.data[2:13]) + } else { + _, _ = hash.Write(h.data[2:]) + } if crc != uint16(hash.Sum32()) { - return nil, errBadHeaderCrc + return nil, ErrBadHeaderCRC } + h.data = h.data[7:] if h.flags&blockHasData > 0 { if len(h.data) < 4 { - return nil, errCorruptHeader + return nil, ErrCorruptBlockHeader } h.dataSize = int64(h.data.uint32()) } if (h.htype == blockService || h.htype == blockFile) && h.flags&fileLargeData > 0 { if len(h.data) < 25 { - return nil, errCorruptHeader + return nil, ErrCorruptBlockHeader } b := h.data[21:25] h.dataSize |= int64(b.uint32()) << 32 @@ -416,30 +425,45 @@ func (a *archive15) readBlockHeader() (*blockHeader15, error) { } // next advances to the next file block in the archive -func (a *archive15) next() (*fileBlockHeader, error) { +func (a *archive15) next(v *volume) (*fileBlockHeader, error) { for { // could return an io.EOF here as 1.5 archives may not have an end block. 
-		h, err := a.readBlockHeader()
+		h, err := a.readBlockHeader(v)
 		if err != nil {
+			// if reached end of file without an end block try to open next volume
+			if err == io.EOF {
+				a.encrypted = false // reset encryption when opening new volume file
+				err = v.next()
+				if err == nil {
+					continue
+				}
+				// new volume doesn't exist, assume end of archive
+				if os.IsNotExist(err) {
+					return nil, io.EOF
+				}
+			}
 			return nil, err
 		}
-		a.byteReader = limitByteReader(a.v, h.dataSize) // reader for block data
-
 		switch h.htype {
 		case blockFile:
 			return a.parseFileHeader(h)
 		case blockArc:
 			a.encrypted = h.flags&arcEncrypted > 0
 			a.multi = h.flags&arcVolume > 0
-			a.old = h.flags&arcNewNaming == 0
+			if v.num == 0 {
+				v.old = h.flags&arcNewNaming == 0
+			}
 			a.solid = h.flags&arcSolid > 0
 		case blockEnd:
 			if h.flags&endArcNotLast == 0 || !a.multi {
-				return nil, errArchiveEnd
+				return nil, io.EOF
 			}
-			return nil, errArchiveContinues
+			a.encrypted = false // reset encryption when opening new volume file
+			err = v.next()
 		default:
-			_, err = io.Copy(ioutil.Discard, a.byteReader)
+			if h.dataSize > 0 {
+				err = v.discard(h.dataSize) // skip over block data
+			}
 		}
 		if err != nil {
 			return nil, err
@@ -447,22 +471,11 @@
 	}
 }
 
-func (a *archive15) version() int { return fileFmt15 }
-
-func (a *archive15) reset() {
-	a.encrypted = false // reset encryption when opening new volume file
-}
-
-func (a *archive15) isSolid() bool {
-	return a.solid
-}
-
 // newArchive15 creates a new fileBlockReader for a Version 1.5 archive
-func newArchive15(r *bufio.Reader, password string) fileBlockReader {
+func newArchive15(password *string) *archive15 {
 	a := new(archive15)
-	a.v = r
-	a.pass = utf16.Encode([]rune(password)) // convert to UTF-16
-	a.checksum.Hash32 = crc32.NewIEEE()
-	a.buf = readBuf(make([]byte, 100))
+	if password != nil {
+		a.pass = utf16.Encode([]rune(*password)) // convert to UTF-16
+	}
 	return a
 }
diff --git a/vendor/github.com/nwaples/rardecode/v2/archive50.go b/vendor/github.com/nwaples/rardecode/v2/archive50.go
new file mode 100644
index 0000000000..98db247363
--- /dev/null
+++ b/vendor/github.com/nwaples/rardecode/v2/archive50.go
@@ -0,0 +1,583 @@
+package rardecode
+
+import (
+	"bytes"
+	"crypto/hmac"
+	"crypto/sha256"
+	"errors"
+	"hash"
+	"hash/crc32"
+	"io"
+	"math"
+	"math/bits"
+	"time"
+)
+
+const (
+	// block types
+	block5Arc = 1
+	block5File = 2
+	// block5Service = 3
+	block5Encrypt = 4
+	block5End = 5
+
+	// block flags
+	block5HasExtra     = 0x0001
+	block5HasData      = 0x0002
+	block5DataNotFirst = 0x0008
+	block5DataNotLast  = 0x0010
+
+	// end block flags
+	endArc5NotLast = 0x0001
+
+	// archive encryption block flags
+	enc5CheckPresent = 0x0001 // password check data is present
+
+	// main archive block flags
+	arc5MultiVol = 0x0001
+	arc5Solid    = 0x0004
+
+	// file block flags
+	file5IsDir          = 0x0001
+	file5HasUnixMtime   = 0x0002
+	file5HasCRC32       = 0x0004
+	file5UnpSizeUnknown = 0x0008
+
+	// file compression flags
+	file5CompAlgorithm = 0x0000003F
+	file5CompSolid     = 0x00000040
+	file5CompMethod    = 0x00000380
+	file5CompDictSize  = 0x00007C00
+	file5CompDictFract = 0x000F8000
+	file5CompV5Compat  = 0x00100000
+
+	// file encryption record flags
+	file5EncCheckPresent = 0x0001 // password check data is present
+	file5EncUseMac       = 0x0002 // use MAC instead of plain checksum
+
+	// precision time flags
+	file5ExtraTimeIsUnixTime = 0x01 // is unix time_t
+	file5ExtraTimeHasMTime   = 0x02 // has modification time
+	file5ExtraTimeHasCTime   = 0x04 // has creation time
+	file5ExtraTimeHasATime   = 0x08 // has access time
+	file5ExtraTimeHasUnixNS  = 0x10 // unix nanosecond time format
+
+	cacheSize50   = 4
+	maxPbkdf2Salt = 64
+	pwCheckSize   = 8
+	maxKdfCount   = 24
+
+	maxDictSize = 0x1000000000 // maximum dictionary size 64GB
+)
+
+var (
+	ErrBadPassword          = errors.New("rardecode: incorrect password")
+	ErrCorruptEncryptData   = errors.New("rardecode: corrupt encryption data")
+	ErrUnknownEncryptMethod = errors.New("rardecode: unknown encryption method")
+	ErrPlatformIntSize      = errors.New("rardecode: platform integer size too small")
+	ErrDictionaryTooLarge   = errors.New("rardecode: decode dictionary too large")
+)
+
+type extra struct {
+	ftype uint64  // field type
+	data  readBuf // field data
+}
+
+type blockHeader50 struct {
+	htype    uint64 // block type
+	flags    uint64
+	data     readBuf // block header data
+	extra    []extra // extra fields
+	dataSize int64   // size of block data
+}
+
+// leHash32 wraps a hash.Hash32 to return the result of Sum in little
+// endian format.
+type leHash32 struct {
+	hash.Hash32
+}
+
+func (h leHash32) Sum(b []byte) []byte {
+	s := h.Sum32()
+	return append(b, byte(s), byte(s>>8), byte(s>>16), byte(s>>24))
+}
+
+func newLittleEndianCRC32() hash.Hash {
+	return leHash32{crc32.NewIEEE()}
+}
+
+// archive50 implements fileBlockReader for RAR 5 file format archives
+type archive50 struct {
+	pass     []byte
+	blockKey []byte // key used to encrypt blocks
+	multi    bool   // archive is multi-volume
+	solid    bool   // is a solid archive
+	keyCache [cacheSize50]struct { // encryption key cache
+		kdfCount int
+		salt     []byte
+		keys     [][]byte
+	}
+}
+
+func (a *archive50) clone() fileBlockReader {
+	na := new(archive50)
+	*na = *a
+	return na
+}
+
+// calcKeys50 calculates the keys used in RAR 5 archive processing.
+// The returned slice of byte slices contains 3 keys.
+// Key 0 is used for block or file decryption.
+// Key 1 is optionally used for file checksum calculation.
+// Key 2 is optionally used for password checking.
+func calcKeys50(pass, salt []byte, kdfCount int) [][]byte {
+	if len(salt) > maxPbkdf2Salt {
+		salt = salt[:maxPbkdf2Salt]
+	}
+	keys := make([][]byte, 3)
+	if len(keys) == 0 {
+		return keys
+	}
+
+	prf := hmac.New(sha256.New, pass)
+	_, _ = prf.Write(salt)
+	_, _ = prf.Write([]byte{0, 0, 0, 1})
+
+	t := prf.Sum(nil)
+	u := append([]byte(nil), t...)
+
+	kdfCount--
+
+	for i, iter := range []int{kdfCount, 16, 16} {
+		for iter > 0 {
+			prf.Reset()
+			_, _ = prf.Write(u)
+			u = prf.Sum(u[:0])
+			for j := range u {
+				t[j] ^= u[j]
+			}
+			iter--
+		}
+		keys[i] = append([]byte(nil), t...)
+	}
+
+	pwcheck := keys[2]
+	for i, v := range pwcheck[pwCheckSize:] {
+		pwcheck[i&(pwCheckSize-1)] ^= v
+	}
+	pwcheck = pwcheck[:pwCheckSize]
+	// add checksum to end of pwcheck
+	sum := sha256.Sum256(pwcheck)
+	pwcheck = append(pwcheck, sum[:4]...)
+	keys[2] = pwcheck
+
+	return keys
+}
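Aside on calcKeys50 above: the accumulation loop is PBKDF2-HMAC-SHA256 (RFC 2898) with a single 32-byte block, sampled at three iteration counts — key 0 after kdfCount iterations, key 1 after 16 more, key 2 after a further 16 (then folded into the password-check value). A hedged standalone sketch reproducing key 0, assuming the golang.org/x/crypto/pbkdf2 package (not part of this diff); it matches because a 32-byte SHA-256 derivation computes exactly the one block that calcKeys50 accumulates in t:

package main

import (
	"crypto/sha256"
	"fmt"

	"golang.org/x/crypto/pbkdf2" // assumed external dependency, for illustration only
)

func main() {
	pass := []byte("password") // example password
	salt := make([]byte, 16)   // example 16-byte salt (all zero here)
	kdfCount := 1 << 15        // archives store log2 of this count as a single byte

	// Same value as calcKeys50(pass, salt, 1<<15)[0]: PBKDF2 with a
	// 32-byte derived key only ever computes block 1.
	key0 := pbkdf2.Key(pass, salt, kdfCount, 32, sha256.New)
	fmt.Printf("%x\n", key0)
}

+
+// getKeys returns the corresponding encryption keys for the given kdfCount and salt.
+// It will check the password if check is provided.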
+func (a *archive50) getKeys(kdfCount int, salt, check []byte) ([][]byte, error) { + var keys [][]byte + + if kdfCount > maxKdfCount { + return nil, ErrCorruptEncryptData + } + kdfCount = 1 << uint(kdfCount) + + // check cache of keys for match + for _, v := range a.keyCache { + if kdfCount == v.kdfCount && bytes.Equal(salt, v.salt) { + keys = v.keys + break + } + } + if keys == nil { + // not found, calculate keys + keys = calcKeys50(a.pass, salt, kdfCount) + + // store in cache + copy(a.keyCache[1:], a.keyCache[:]) + a.keyCache[0].kdfCount = kdfCount + a.keyCache[0].salt = append([]byte(nil), salt...) + a.keyCache[0].keys = keys + } + + // check password + if check != nil && !bytes.Equal(check, keys[2]) { + return nil, ErrBadPassword + } + return keys, nil +} + +// parseFileEncryptionRecord processes the optional file encryption record from a file header. +func (a *archive50) parseFileEncryptionRecord(b readBuf, f *fileBlockHeader) error { + f.Encrypted = true + if ver := b.uvarint(); ver != 0 { + return ErrUnknownEncryptMethod + } + flags := b.uvarint() + if len(b) < 33 { + return ErrCorruptEncryptData + } + kdfCount := int(b.byte()) + salt := append([]byte(nil), b.bytes(16)...) + f.iv = append([]byte(nil), b.bytes(16)...) + + var check []byte + if flags&file5EncCheckPresent > 0 { + if len(b) < 12 { + return ErrCorruptEncryptData + } + check = append([]byte(nil), b.bytes(12)...) + } + useMac := flags&file5EncUseMac > 0 + // only need to generate keys for first block or + // last block if it has an optional hash key + if !(f.first || (f.last && useMac)) { + return nil + } + f.genKeys = func() error { + if a.pass == nil { + return ErrArchivedFileEncrypted + } + keys, err := a.getKeys(kdfCount, salt, check) + if err != nil { + return err + } + + f.key = keys[0] + if useMac { + f.hashKey = keys[1] + } + return nil + } + return nil +} + +func readWinFiletime(b *readBuf) (time.Time, error) { + if len(*b) < 8 { + return time.Time{}, ErrCorruptFileHeader + } + // 100-nanosecond intervals since January 1, 1601 + t := b.uint64() - 116444736000000000 + t *= 100 + sec, nsec := bits.Div64(0, t, uint64(time.Second)) + return time.Unix(int64(sec), int64(nsec)), nil +} + +func readUnixTime(b *readBuf) (time.Time, error) { + if len(*b) < 4 { + return time.Time{}, ErrCorruptFileHeader + } + return time.Unix(int64(b.uint32()), 0), nil +} + +func readUnixNanoseconds(b *readBuf) (time.Duration, error) { + if len(*b) < 4 { + return 0, ErrCorruptFileHeader + } + d := time.Duration(b.uint32() & 0x3fffffff) + if d >= time.Second { + return 0, ErrCorruptFileHeader + } + return d, nil +} + +// parseFilePrecisionTimeRecord processes the optional high precision time record from a file header. 
+func (a *archive50) parseFilePrecisionTimeRecord(b *readBuf, f *fileBlockHeader) error { + var err error + flags := b.uvarint() + isUnixTime := flags&file5ExtraTimeIsUnixTime > 0 + if flags&file5ExtraTimeHasMTime > 0 { + if isUnixTime { + f.ModificationTime, err = readUnixTime(b) + } else { + f.ModificationTime, err = readWinFiletime(b) + } + if err != nil { + return err + } + } + if flags&file5ExtraTimeHasCTime > 0 { + if isUnixTime { + f.CreationTime, err = readUnixTime(b) + } else { + f.CreationTime, err = readWinFiletime(b) + } + if err != nil { + return err + } + } + if flags&file5ExtraTimeHasATime > 0 { + if isUnixTime { + f.AccessTime, err = readUnixTime(b) + } else { + f.AccessTime, err = readWinFiletime(b) + } + if err != nil { + return err + } + } + if isUnixTime && flags&file5ExtraTimeHasUnixNS > 0 { + if flags&file5ExtraTimeHasMTime > 0 { + ns, err := readUnixNanoseconds(b) + if err != nil { + return err + } + f.ModificationTime = f.ModificationTime.Add(ns) + } + if flags&file5ExtraTimeHasCTime > 0 { + ns, err := readUnixNanoseconds(b) + if err != nil { + return err + } + f.CreationTime = f.CreationTime.Add(ns) + } + if flags&file5ExtraTimeHasATime > 0 { + ns, err := readUnixNanoseconds(b) + if err != nil { + return err + } + f.AccessTime = f.AccessTime.Add(ns) + } + } + return nil +} + +func (a *archive50) parseFileHeader(h *blockHeader50) (*fileBlockHeader, error) { + f := new(fileBlockHeader) + + f.HeaderEncrypted = a.blockKey != nil + f.first = h.flags&block5DataNotFirst == 0 + f.last = h.flags&block5DataNotLast == 0 + + flags := h.data.uvarint() // file flags + f.IsDir = flags&file5IsDir > 0 + f.UnKnownSize = flags&file5UnpSizeUnknown > 0 + f.UnPackedSize = int64(h.data.uvarint()) + f.PackedSize = h.dataSize + f.Attributes = int64(h.data.uvarint()) + if flags&file5HasUnixMtime > 0 { + if len(h.data) < 4 { + return nil, ErrCorruptFileHeader + } + f.ModificationTime = time.Unix(int64(h.data.uint32()), 0) + } + if flags&file5HasCRC32 > 0 { + if len(h.data) < 4 { + return nil, ErrCorruptFileHeader + } + f.sum = append([]byte(nil), h.data.bytes(4)...) 
+ if f.first { + f.hash = newLittleEndianCRC32 + } + } + + flags = h.data.uvarint() // compression flags + f.Solid = flags&file5CompSolid > 0 + f.arcSolid = a.solid + method := (flags >> 7) & 7 // compression method (0 == none) + if f.first && method != 0 { + unpackver := flags & file5CompAlgorithm + var winSize int64 + if unpackver == 0 { + f.decVer = decode50Ver + winSize = 0x20000 << ((flags >> 10) & 0x0F) + } else if unpackver == 1 { + if flags&file5CompV5Compat > 0 { + f.decVer = decode50Ver + } else { + f.decVer = decode70Ver + } + winSize = 0x20000 << ((flags >> 10) & 0x1F) + winSize += winSize / 32 * int64((flags>>15)&0x1F) + if winSize > maxDictSize { + return nil, ErrDictionaryTooLarge + } + } else { + return nil, ErrUnknownDecoder + } + if winSize > math.MaxInt { + return nil, ErrPlatformIntSize + } + f.winSize = int(winSize) + } + switch h.data.uvarint() { + case 0: + f.HostOS = HostOSWindows + case 1: + f.HostOS = HostOSUnix + default: + f.HostOS = HostOSUnknown + } + nlen := int(h.data.uvarint()) + if len(h.data) < nlen { + return nil, ErrCorruptFileHeader + } + f.Name = string(h.data.bytes(nlen)) + + // parse optional extra records + for _, e := range h.extra { + var err error + switch e.ftype { + case 1: // encryption + err = a.parseFileEncryptionRecord(e.data, f) + case 2: + // TODO: hash + case 3: + err = a.parseFilePrecisionTimeRecord(&e.data, f) + case 4: // version + _ = e.data.uvarint() // ignore flags field + f.Version = int(e.data.uvarint()) + case 5: + // TODO: redirection + case 6: + // TODO: owner + } + if err != nil { + return nil, err + } + } + return f, nil +} + +// parseEncryptionBlock calculates the key for block encryption. +func (a *archive50) parseEncryptionBlock(b readBuf) error { + if a.pass == nil { + return ErrArchiveEncrypted + } + if ver := b.uvarint(); ver != 0 { + return ErrUnknownEncryptMethod + } + flags := b.uvarint() + if len(b) < 17 { + return ErrCorruptEncryptData + } + kdfCount := int(b.byte()) + salt := b.bytes(16) + + var check []byte + if flags&enc5CheckPresent > 0 { + if len(b) < 12 { + return ErrCorruptEncryptData + } + check = b.bytes(12) + } + + keys, err := a.getKeys(kdfCount, salt, check) + if err != nil { + return err + } + a.blockKey = keys[0] + return nil +} + +func (a *archive50) readBlockHeader(r sliceReader) (*blockHeader50, error) { + if a.blockKey != nil { + // block is encrypted + iv, err := r.readSlice(16) + if err != nil { + return nil, err + } + r = newAesSliceReader(r, a.blockKey, iv) + } + var b readBuf + var err error + // peek to find the header size + b, err = r.peek(7) + if err != nil { + return nil, err + } + crc := b.uint32() + + hash := crc32.NewIEEE() + + size := int(b.uvarint()) // header size + b, err = r.readSlice(7 - len(b) + size) + if err != nil { + return nil, err + } + + // check header crc + _, _ = hash.Write(b[4:]) + if crc != hash.Sum32() { + return nil, ErrBadHeaderCRC + } + + b = b[len(b)-size:] + h := new(blockHeader50) + h.htype = b.uvarint() + h.flags = b.uvarint() + + var extraSize int + if h.flags&block5HasExtra > 0 { + extraSize = int(b.uvarint()) + } + if h.flags&block5HasData > 0 { + h.dataSize = int64(b.uvarint()) + } + if len(b) < extraSize { + return nil, ErrCorruptBlockHeader + } + h.data = b.bytes(len(b) - extraSize) + + // read header extra records + for len(b) > 0 { + size = int(b.uvarint()) + if len(b) < size { + return nil, ErrCorruptBlockHeader + } + data := readBuf(b.bytes(size)) + ftype := data.uvarint() + h.extra = append(h.extra, extra{ftype, data}) + } + + return h, nil +} + 
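The variable-length integer fields parsed throughout readBlockHeader above use the little-endian base-128 encoding implemented by readBuf.uvarint in the new archive.go: seven payload bits per byte, high bit set on every byte except the last. A standalone re-implementation with a worked value, for illustration only (not the vendored code):

package main

import "fmt"

// uvarint decodes little-endian base-128, as readBuf.uvarint does above.
// It returns the decoded value and the number of input bytes consumed
// (0 if the input ended before a terminating byte).
func uvarint(b []byte) (uint64, int) {
	var x uint64
	var s uint
	for i, n := range b {
		if n < 0x80 {
			return x | uint64(n)<<s, i + 1
		}
		x |= uint64(n&0x7f) << s
		s += 7
	}
	return x, 0 // truncated input
}

func main() {
	// 0xE5 0x8E 0x26 -> 0x65 + 0x0E<<7 + 0x26<<14 = 101 + 1792 + 622592 = 624485
	v, n := uvarint([]byte{0xE5, 0x8E, 0x26})
	fmt.Println(v, n) // 624485 3
}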
+// next advances to the next file block in the archive +func (a *archive50) next(v *volume) (*fileBlockHeader, error) { + for { + // get next block header + h, err := a.readBlockHeader(v) + if err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + return nil, err + } + switch h.htype { + case block5File: + return a.parseFileHeader(h) + case block5Arc: + flags := h.data.uvarint() + a.multi = flags&arc5MultiVol > 0 + a.solid = flags&arc5Solid > 0 + case block5Encrypt: + err = a.parseEncryptionBlock(h.data) + case block5End: + flags := h.data.uvarint() + if flags&endArc5NotLast == 0 || !a.multi { + return nil, io.EOF + } + a.blockKey = nil // reset encryption when opening new volume file + err = v.next() + default: + if h.dataSize > 0 { + err = v.discard(h.dataSize) // skip over block data + } + } + if err != nil { + return nil, err + } + } +} + +// newArchive50 creates a new fileBlockReader for a Version 5 archive. +func newArchive50(password *string) *archive50 { + a := new(archive50) + if password != nil { + a.pass = []byte(*password) + } + return a +} diff --git a/vendor/github.com/nwaples/rardecode/v2/bit_reader.go b/vendor/github.com/nwaples/rardecode/v2/bit_reader.go new file mode 100644 index 0000000000..d0c931b21f --- /dev/null +++ b/vendor/github.com/nwaples/rardecode/v2/bit_reader.go @@ -0,0 +1,259 @@ +package rardecode + +import ( + "io" + "math/bits" +) + +type bitReader interface { + readBits(n uint8) (int, error) // read n bits of data + unreadBits(n uint8) // revert the reading of the last n bits read +} + +// rar5BitReader is a bitReader that reads bytes from a byteReader and stops with io.EOF after l bits. +type rar5BitReader struct { + r byteReader + v int // cache of bits read from r + l int // number of bits (not cached) that can be read from r + n uint8 // number of unread bits in v + b []byte // bytes() output cache from r +} + +func (r *rar5BitReader) unreadBits(n uint8) { r.n += n } + +// ReadByte reads a byte from rar5BitReader's byteReader ignoring the bit cache v. +func (r *rar5BitReader) ReadByte() (byte, error) { + if len(r.b) == 0 { + var err error + r.b, err = r.r.bytes() + if err != nil { + if err == io.EOF { + err = ErrDecoderOutOfData + } + return 0, err + } + } + c := r.b[0] + r.b = r.b[1:] + return c, nil +} + +func (r *rar5BitReader) reset(br byteReader) { + r.r = br + r.b = nil +} + +// setLimit sets the maximum bit count that can be read. +func (r *rar5BitReader) setLimit(n int) { + r.l = n + r.n = 0 +} + +// readBits returns n bits from the underlying byteReader. +// n must be less than integer size - 8. +func (r *rar5BitReader) readBits(n uint8) (int, error) { + for n > r.n { + if r.l == 0 { + // reached bits limit + return 0, io.EOF + } + if len(r.b) == 0 { + var err error + r.b, err = r.r.bytes() + if err != nil { + if err == io.EOF { + // io.EOF before we reached bit limit + err = ErrDecoderOutOfData + } + return 0, err + } + } + // try to fit as many bits into r.v as possible + for len(r.b) > 0 && r.n <= bits.UintSize-8 { + r.v = r.v<<8 | int(r.b[0]) + r.b = r.b[1:] + r.n += 8 + r.l -= 8 + if r.l <= 0 { + if r.l < 0 { + // overshot, discard the extra bits + bits := uint8(-r.l) + r.l = 0 + r.v >>= bits + r.n -= bits + } + break + } + } + } + r.n -= n + return (r.v >> r.n) & ((1 << n) - 1), nil +} + +// replaceByteReader is a byteReader that returns b on the first call to bytes() +// and then replaces the byteReader at rp with r. 
+type replaceByteReader struct { + rp *byteReader + r byteReader + b []byte +} + +func (r *replaceByteReader) Read(p []byte) (int, error) { return 0, io.EOF } + +func (r *replaceByteReader) bytes() ([]byte, error) { + *r.rp = r.r + return r.b, nil +} + +// rarBitReader wraps an io.ByteReader to perform various bit and byte +// reading utility functions used in RAR file processing. +type rarBitReader struct { + r byteReader + v int + n uint8 + b []byte +} + +func (r *rarBitReader) reset(br byteReader) { + r.r = br + r.n = 0 + r.v = 0 + r.b = nil +} + +// unshiftBytes moves any bytes in rarBitReader bit cache back into a byte slice +// and sets up byteReader's so that all bytes can now be read by ReadByte() without +// going through the bit cache. +func (r *rarBitReader) unshiftBytes() { + // no cached bits + if r.n == 0 { + return + } + // create and read byte slice for cached bits + b := make([]byte, r.n/8) + for i := len(b) - 1; i >= 0; i-- { + b[i] = byte(r.v) + r.v >>= 8 + } + r.n = 0 + // current bytes buffer empty, so store b and return + if len(r.b) == 0 { + r.b = b + return + } + // Put current bytes buffer and byteReader in a replaceByteReader and + // the unshifted bytes in the rarBitReader bytes buffer. + // When the bytes buffer is consumed, rarBitReader will call bytes() + // on replaceByteReader which will return the old bytes buffer and + // replace itself with the old byteReader in rarBitReader. + r.r = &replaceByteReader{rp: &r.r, r: r.r, b: r.b} + r.b = b +} + +// readBits returns n bits from the underlying byteReader. +// n must be less than integer size - 8. +func (r *rarBitReader) readBits(n uint8) (int, error) { + for n > r.n { + if len(r.b) == 0 { + var err error + r.b, err = r.r.bytes() + if err != nil { + return 0, err + } + } + // try to fit as many bits into r.v as possible + for len(r.b) > 0 && r.n <= bits.UintSize-8 { + r.v = r.v<<8 | int(r.b[0]) + r.b = r.b[1:] + r.n += 8 + } + } + r.n -= n + return (r.v >> r.n) & ((1 << n) - 1), nil +} + +func (r *rarBitReader) unreadBits(n uint8) { + r.n += n +} + +// alignByte aligns the current bit reading input to the next byte boundary. +func (r *rarBitReader) alignByte() { + r.n -= r.n % 8 +} + +// readUint32 reads a RAR V3 encoded uint32 +func (r *rarBitReader) readUint32() (uint32, error) { + n, err := r.readBits(2) + if err != nil { + return 0, err + } + if n != 1 { + if bits.UintSize == 32 { + if n == 3 { + // 32bit platforms may not be able to read 32 bits as r.v + // will need up to 7 extra bits for overflow from reading a byte. + // Split it into two reads. + n, err = r.readBits(16) + if err != nil { + return 0, err + } + m := uint32(n) << 16 + n, err = r.readBits(16) + return m | uint32(n), err + } + } + n, err = r.readBits(4 << uint(n)) + return uint32(n), err + } + n, err = r.readBits(4) + if err != nil { + return 0, err + } + if n == 0 { + n, err = r.readBits(8) + n |= -1 << 8 + return uint32(n), err + } + nlow, err := r.readBits(4) + n = n<<4 | nlow + return uint32(n), err +} + +// ReadByte() returns a byte directly from buf b or the io.ByteReader r. +// Current bit offsets are ignored. +func (r *rarBitReader) ReadByte() (byte, error) { + if len(r.b) == 0 { + if r.r == nil { + return 0, io.EOF + } + var err error + r.b, err = r.r.bytes() + if err != nil { + return 0, err + } + } + c := r.b[0] + r.b = r.b[1:] + return c, nil +} + +// readFull reads len(p) bytes into p. If fewer bytes are read an error is returned. 
+func (r *rarBitReader) readFull(p []byte) error {
+	if r.n == 0 && len(r.b) > 0 {
+		n := copy(p, r.b)
+		p = p[n:]
+		r.b = r.b[n:]
+	}
+	for i := range p {
+		n, err := r.readBits(8)
+		if err != nil {
+			return err
+		}
+		p[i] = byte(n)
+	}
+	return nil
+}
+
+func newRarBitReader(r byteReader) *rarBitReader {
+	return &rarBitReader{r: r}
+}
diff --git a/vendor/github.com/nwaples/rardecode/v2/decode20.go b/vendor/github.com/nwaples/rardecode/v2/decode20.go
new file mode 100644
index 0000000000..3a5ed5e937
--- /dev/null
+++ b/vendor/github.com/nwaples/rardecode/v2/decode20.go
@@ -0,0 +1,161 @@
+package rardecode
+
+import (
+	"io"
+)
+
+const audioSize = 257
+
+type decoder20 struct {
+	br      *rarBitReader
+	size    int64 // unpacked bytes left to be decompressed
+	hdrRead bool  // block header has been read
+	isAudio bool  // current block is Audio
+
+	codeLength [audioSize * 4]byte
+
+	lz    *lz20Decoder
+	audio *audio20Decoder
+}
+
+func (d *decoder20) version() int { return decode20Ver }
+
+// init initializes the decoder for decoding a new file.
+func (d *decoder20) init(r byteReader, reset bool, size int64, ver int) {
+	if d.br == nil {
+		d.br = newRarBitReader(r)
+	} else {
+		d.br.reset(r)
+	}
+	d.size = size
+	if reset {
+		d.hdrRead = false
+		d.isAudio = false
+		if d.audio != nil {
+			d.audio.reset()
+		}
+		clear(d.codeLength[:])
+	}
+}
+
+func readCodeLengthTable20(br *rarBitReader, table []byte) error {
+	var bitlength [19]byte
+	for i := 0; i < len(bitlength); i++ {
+		n, err := br.readBits(4)
+		if err != nil {
+			return err
+		}
+		bitlength[i] = byte(n)
+	}
+
+	var bl huffmanDecoder
+	bl.init(bitlength[:])
+
+	for i := 0; i < len(table); {
+		l, err := bl.readSym(br)
+		if err != nil {
+			return err
+		}
+		if l < 16 {
+			table[i] = (table[i] + byte(l)) & 0xf
+			i++
+			continue
+		}
+		if l == 16 {
+			if i == 0 {
+				return ErrInvalidLengthTable
+			}
+			var n int
+			n, err = br.readBits(2)
+			if err != nil {
+				return err
+			}
+			n += 3
+			n = min(i+n, len(table))
+			v := table[i-1]
+			for i < n {
+				table[i] = v
+				i++
+			}
+			continue
+		}
+		var n int
+		if l == 17 {
+			n, err = br.readBits(3)
+			if err != nil {
+				return err
+			}
+			n += 3
+		} else {
+			n, err = br.readBits(7)
+			if err != nil {
+				return err
+			}
+			n += 11
+		}
+		n = min(i+n, len(table))
+		clear(table[i:n])
+		i = n
+	}
+	return nil
+}
+
+func (d *decoder20) readBlockHeader() error {
+	n, err := d.br.readBits(1)
+	if err != nil {
+		return err
+	}
+	d.isAudio = n > 0
+	n, err = d.br.readBits(1)
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		clear(d.codeLength[:])
+	}
+	if d.isAudio {
+		if d.audio == nil {
+			d.audio = new(audio20Decoder)
+		}
+		err = d.audio.init(d.br, d.codeLength[:])
+	} else {
+		if d.lz == nil {
+			d.lz = new(lz20Decoder)
+		}
+		err = d.lz.init(d.br, d.codeLength[:])
+	}
+	d.hdrRead = true
+	return err
+}
+
+func (d *decoder20) fill(dr *decodeReader) error {
+	for d.size > 0 && dr.notFull() {
+		if !d.hdrRead {
+			if err := d.readBlockHeader(); err != nil {
+				return err
+			}
+		}
+		var n int64
+		var err error
+		if d.isAudio {
+			n, err = d.audio.fill(dr, d.size)
+		} else {
+			n, err = d.lz.fill(dr, d.size)
+		}
+		d.size -= n
+		switch err {
+		case nil:
+			continue
+		case errEndOfBlock:
+			d.hdrRead = false
+			continue
+		case io.EOF:
+			err = ErrDecoderOutOfData
+		}
+		return err
+	}
+	if d.size == 0 {
+		return io.EOF
+	}
+	return nil
+}
diff --git a/vendor/github.com/nwaples/rardecode/v2/decode20_audio.go b/vendor/github.com/nwaples/rardecode/v2/decode20_audio.go
new file mode 100644
index 0000000000..6258cb2ab8
--- 
/dev/null +++ b/vendor/github.com/nwaples/rardecode/v2/decode20_audio.go @@ -0,0 +1,128 @@ +package rardecode + +type audioVar struct { + k [5]int + d [4]int + lastDelta int + dif [11]int + byteCount int + lastChar int +} + +type audio20Decoder struct { + chans int // number of audio channels + curChan int // current audio channel + chanDelta int + + decoders [4]huffmanDecoder + vars [4]audioVar + + br *rarBitReader +} + +func (d *audio20Decoder) reset() { + d.chans = 1 + d.curChan = 0 + d.chanDelta = 0 + + for i := range d.vars { + d.vars[i] = audioVar{} + } +} + +func (d *audio20Decoder) init(br *rarBitReader, table []byte) error { + d.br = br + n, err := br.readBits(2) + if err != nil { + return err + } + d.chans = n + 1 + if d.curChan >= d.chans { + d.curChan = 0 + } + table = table[:audioSize*d.chans] + if err = readCodeLengthTable20(br, table); err != nil { + return err + } + for i := 0; i < d.chans; i++ { + d.decoders[i].init(table[:audioSize]) + table = table[audioSize:] + } + return nil +} + +func (d *audio20Decoder) decode(delta int) byte { + v := &d.vars[d.curChan] + v.byteCount++ + v.d[3] = v.d[2] + v.d[2] = v.d[1] + v.d[1] = v.lastDelta - v.d[0] + v.d[0] = v.lastDelta + pch := 8*v.lastChar + v.k[0]*v.d[0] + v.k[1]*v.d[1] + v.k[2]*v.d[2] + v.k[3]*v.d[3] + v.k[4]*d.chanDelta + pch = (pch >> 3) & 0xFF + ch := pch - delta + delta <<= 3 + + v.dif[0] += abs(delta) + v.dif[1] += abs(delta - v.d[0]) + v.dif[2] += abs(delta + v.d[0]) + v.dif[3] += abs(delta - v.d[1]) + v.dif[4] += abs(delta + v.d[1]) + v.dif[5] += abs(delta - v.d[2]) + v.dif[6] += abs(delta + v.d[2]) + v.dif[7] += abs(delta - v.d[3]) + v.dif[8] += abs(delta + v.d[3]) + v.dif[9] += abs(delta - d.chanDelta) + v.dif[10] += abs(delta + d.chanDelta) + + d.chanDelta = ch - v.lastChar + v.lastDelta = d.chanDelta + v.lastChar = ch + + if v.byteCount&0x1F != 0 { + return byte(ch) + } + + var numMinDif int + minDif := v.dif[0] + v.dif[0] = 0 + for i := 1; i < len(v.dif); i++ { + if v.dif[i] < minDif { + minDif = v.dif[i] + numMinDif = i + } + v.dif[i] = 0 + } + if numMinDif > 0 { + numMinDif-- + i := numMinDif / 2 + if numMinDif%2 == 0 { + if v.k[i] >= -16 { + v.k[i]-- + } + } else if v.k[i] < 16 { + v.k[i]++ + } + } + return byte(ch) +} + +func (d *audio20Decoder) fill(dr *decodeReader, size int64) (int64, error) { + var n int64 + for n < size && dr.notFull() { + sym, err := d.decoders[d.curChan].readSym(d.br) + if err != nil { + return n, err + } + if sym == 256 { + return n, errEndOfBlock + } + dr.writeByte(d.decode(sym)) + n++ + d.curChan++ + if d.curChan >= d.chans { + d.curChan = 0 + } + } + return n, nil +} diff --git a/vendor/github.com/nwaples/rardecode/v2/decode20_lz.go b/vendor/github.com/nwaples/rardecode/v2/decode20_lz.go new file mode 100644 index 0000000000..0134b3c142 --- /dev/null +++ b/vendor/github.com/nwaples/rardecode/v2/decode20_lz.go @@ -0,0 +1,150 @@ +package rardecode + +const ( + main20Size = 298 + offset20Size = 48 + length20Size = 28 +) + +type lz20Decoder struct { + length int // previous length + offset [4]int // history of previous offsets + + mainDecoder huffmanDecoder + offsetDecoder huffmanDecoder + lengthDecoder huffmanDecoder + + br *rarBitReader +} + +func (d *lz20Decoder) init(br *rarBitReader, table []byte) error { + d.br = br + + table = table[:main20Size+offset20Size+length20Size] + if err := readCodeLengthTable20(br, table); err != nil { + return err + } + d.mainDecoder.init(table[:main20Size]) + table = table[main20Size:] + 
d.offsetDecoder.init(table[:offset20Size]) + table = table[offset20Size:] + d.lengthDecoder.init(table) + return nil +} + +func (d *lz20Decoder) decodeOffset(i int) error { + d.length = lengthBase[i] + 3 + bits := lengthExtraBits[i] + if bits > 0 { + n, err := d.br.readBits(bits) + if err != nil { + return err + } + d.length += n + } + + var err error + i, err = d.offsetDecoder.readSym(d.br) + if err != nil { + return err + } + offset := offsetBase[i] + 1 + bits = offsetExtraBits[i] + if bits > 0 { + n, err := d.br.readBits(bits) + if err != nil { + return err + } + offset += n + } + + if offset >= 0x2000 { + d.length++ + if offset >= 0x40000 { + d.length++ + } + } + copy(d.offset[1:], d.offset[:]) + d.offset[0] = offset + return nil +} + +func (d *lz20Decoder) decodeLength(i int) error { + offset := d.offset[i] + copy(d.offset[1:], d.offset[:]) + d.offset[0] = offset + + i, err := d.lengthDecoder.readSym(d.br) + if err != nil { + return err + } + d.length = lengthBase[i] + 2 + bits := lengthExtraBits[i] + if bits > 0 { + var n int + n, err = d.br.readBits(bits) + if err != nil { + return err + } + d.length += n + } + if offset >= 0x101 { + d.length++ + if offset >= 0x2000 { + d.length++ + if offset >= 0x40000 { + d.length++ + } + } + } + return nil +} + +func (d *lz20Decoder) decodeShortOffset(i int) error { + copy(d.offset[1:], d.offset[:]) + offset := shortOffsetBase[i] + 1 + bits := shortOffsetExtraBits[i] + if bits > 0 { + n, err := d.br.readBits(bits) + if err != nil { + return err + } + offset += n + } + d.offset[0] = offset + d.length = 2 + return nil +} + +func (d *lz20Decoder) fill(dr *decodeReader, size int64) (int64, error) { + var n int64 + for n < size && dr.notFull() { + sym, err := d.mainDecoder.readSym(d.br) + if err != nil { + return n, err + } + + switch { + case sym < 256: // literal + dr.writeByte(byte(sym)) + n++ + continue + case sym > 269: + err = d.decodeOffset(sym - 270) + case sym == 269: + return n, errEndOfBlock + case sym == 256: // use previous offset and length + copy(d.offset[1:], d.offset[:]) + case sym < 261: + err = d.decodeLength(sym - 257) + default: + err = d.decodeShortOffset(sym - 261) + } + if err != nil { + return n, err + } + dr.copyBytes(d.length, d.offset[0]) + n += int64(d.length) + } + return n, nil +} diff --git a/vendor/github.com/nwaples/rardecode/decode29.go b/vendor/github.com/nwaples/rardecode/v2/decode29.go similarity index 72% rename from vendor/github.com/nwaples/rardecode/decode29.go rename to vendor/github.com/nwaples/rardecode/v2/decode29.go index 638645e79b..cb47f1a16a 100644 --- a/vendor/github.com/nwaples/rardecode/decode29.go +++ b/vendor/github.com/nwaples/rardecode/v2/decode29.go @@ -1,7 +1,6 @@ package rardecode import ( - "bytes" "errors" "io" ) @@ -13,9 +12,9 @@ const ( var ( // Errors marking the end of the decoding block and/or file - endOfFile = errors.New("rardecode: end of file") - endOfBlock = errors.New("rardecode: end of block") - endOfBlockAndFile = errors.New("rardecode: end of block and file") + errEndOfFile = errors.New("rardecode: end of file") + errEndOfBlock = errors.New("rardecode: end of block") + errEndOfBlockAndFile = errors.New("rardecode: end of block and file") ) // decoder29 implements the decoder interface for RAR 3.0 compression (unpack version 29) @@ -25,22 +24,21 @@ var ( // block marker in the data. 
type decoder29 struct { br *rarBitReader + hdrRead bool // block header has been read + isPPM bool // current block is PPM eof bool // at file eof fnum int // current filter number (index into filters) flen []int // filter block length history filters []v3Filter // list of current filters used by archive encoding - // current decode function (lz or ppm). - // When called it should perform a single decode operation, and either apply the - // data to the window or return they raw bytes for a filter. - decode func(w *window) ([]byte, error) - - lz lz29Decoder // lz decoder - ppm ppm29Decoder // ppm decoder + lz *lz29Decoder // lz decoder + ppm *ppm29Decoder // ppm decoder } +func (d *decoder29) version() int { return decode29Ver } + // init intializes the decoder for decoding a new file. -func (d *decoder29) init(r io.ByteReader, reset bool) error { +func (d *decoder29) init(r byteReader, reset bool, size int64, ver int) { if d.br == nil { d.br = newRarBitReader(r) } else { @@ -49,14 +47,14 @@ func (d *decoder29) init(r io.ByteReader, reset bool) error { d.eof = false if reset { d.initFilters() - d.lz.reset() - d.ppm.reset() - d.decode = nil - } - if d.decode == nil { - return d.readBlockHeader() + if d.lz != nil { + d.lz.reset() + } + if d.ppm != nil { + d.ppm.reset() + } + d.hdrRead = false } - return nil } func (d *decoder29) initFilters() { @@ -72,7 +70,7 @@ func readVMCode(br *rarBitReader) ([]byte, error) { return nil, err } if n > maxCodeSize || n == 0 { - return nil, errInvalidFilter + return nil, ErrInvalidFilter } buf := make([]byte, n) err = br.readFull(buf) @@ -85,14 +83,14 @@ func readVMCode(br *rarBitReader) ([]byte, error) { } // simple xor checksum on data if x != buf[0] { - return nil, errInvalidFilter + return nil, ErrInvalidFilter } return buf, nil } func (d *decoder29) parseVMFilter(buf []byte) (*filterBlock, error) { flags := buf[0] - br := newRarBitReader(bytes.NewReader(buf[1:])) + br := newRarBitReader(newBufByteReader(buf[1:])) fb := new(filterBlock) // Find the filter number which is an index into d.filters. 
@@ -104,14 +102,13 @@ func (d *decoder29) parseVMFilter(buf []byte) (*filterBlock, error) { } if n == 0 { d.initFilters() - fb.reset = true } else { n-- if n > maxUniqueFilters { - return nil, errInvalidFilter + return nil, ErrInvalidFilter } if int(n) > len(d.filters) { - return nil, errInvalidFilter + return nil, ErrInvalidFilter } } d.fnum = int(n) @@ -181,7 +178,7 @@ func (d *decoder29) parseVMFilter(buf []byte) (*filterBlock, error) { return nil, err } if n > vmGlobalSize-vmFixedGlobalSize { - return nil, errInvalidFilter + return nil, ErrInvalidFilter } g = make([]byte, n) err = br.readFull(g) @@ -205,60 +202,70 @@ func (d *decoder29) readBlockHeader() error { n, err := d.br.readBits(1) if err == nil { if n > 0 { - d.decode = d.ppm.decode + d.isPPM = true + if d.ppm == nil { + d.ppm = newPPM29Decoder() + } err = d.ppm.init(d.br) } else { - d.decode = d.lz.decode + d.isPPM = false + if d.lz == nil { + d.lz = new(lz29Decoder) + } err = d.lz.init(d.br) } } if err == io.EOF { - err = errDecoderOutOfData + err = ErrDecoderOutOfData } + d.hdrRead = true return err - } -func (d *decoder29) fill(w *window) ([]*filterBlock, error) { +func (d *decoder29) fill(dr *decodeReader) error { if d.eof { - return nil, io.EOF + return io.EOF } - var fl []*filterBlock - - for w.available() > 0 { - b, err := d.decode(w) // perform a single decode operation + for dr.notFull() { + var err error + if !d.hdrRead { + if err = d.readBlockHeader(); err != nil { + return err + } + } + var b []byte + if d.isPPM { + b, err = d.ppm.fill(dr) + } else { + b, err = d.lz.fill(dr) + } if len(b) > 0 && err == nil { // parse raw data for filter and add to list of filters var f *filterBlock f, err = d.parseVMFilter(b) if f != nil { - // make offset relative to read index (from write index) - f.offset += w.buffered() - fl = append(fl, f) + err = dr.queueFilter(f) } } switch err { case nil: continue - case endOfBlock: - err = d.readBlockHeader() - if err == nil { - continue - } - case endOfFile: + case errEndOfBlock: + d.hdrRead = false + continue + case errEndOfFile: d.eof = true err = io.EOF - case endOfBlockAndFile: + case errEndOfBlockAndFile: d.eof = true - d.decode = nil // clear decoder, it will be setup by next init() + d.hdrRead = false err = io.EOF case io.EOF: - err = errDecoderOutOfData + err = ErrDecoderOutOfData } - return fl, err + return err } - // return filters - return fl, nil + return nil } diff --git a/vendor/github.com/nwaples/rardecode/decode29_lz.go b/vendor/github.com/nwaples/rardecode/v2/decode29_lz.go similarity index 51% rename from vendor/github.com/nwaples/rardecode/decode29_lz.go rename to vendor/github.com/nwaples/rardecode/v2/decode29_lz.go index 94470853dc..4133354f39 100644 --- a/vendor/github.com/nwaples/rardecode/decode29_lz.go +++ b/vendor/github.com/nwaples/rardecode/v2/decode29_lz.go @@ -11,7 +11,7 @@ const ( var ( lengthBase = [28]int{0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224} - lengthExtraBits = [28]uint{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, + lengthExtraBits = [28]uint8{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5} offsetBase = [60]int{0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, @@ -21,13 +21,13 @@ var ( 589824, 655360, 720896, 786432, 851968, 917504, 983040, 1048576, 1310720, 1572864, 1835008, 2097152, 2359296, 2621440, 2883584, 3145728, 3407872, 3670016, 3932160} - offsetExtraBits = [60]uint{0, 0, 0, 0, 1, 1, 2, 2, 3, 
3, 4, 4, 5, 5, 6, + offsetExtraBits = [60]uint8{0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18} shortOffsetBase = [8]int{0, 4, 8, 16, 32, 64, 128, 192} - shortOffsetExtraBits = [8]uint{2, 2, 3, 4, 5, 6, 6, 6} + shortOffsetExtraBits = [8]uint8{2, 2, 3, 4, 5, 6, 6, 6} ) type lz29Decoder struct { @@ -47,13 +47,9 @@ type lz29Decoder struct { } func (d *lz29Decoder) reset() { - for i := range d.offset { - d.offset[i] = 0 - } + clear(d.offset[:]) d.length = 0 - for i := range d.codeLength { - d.codeLength[i] = 0 - } + clear(d.codeLength[:]) } func (d *lz29Decoder) init(br *rarBitReader) error { @@ -84,12 +80,12 @@ func (d *lz29Decoder) init(br *rarBitReader) error { } func (d *lz29Decoder) readFilterData() (b []byte, err error) { - flags, err := d.br.ReadByte() + flags, err := d.br.readBits(8) if err != nil { return nil, err } - n := (int(flags) & 7) + 1 + n := flags&7 + 1 switch n { case 7: n, err = d.br.readBits(8) @@ -105,7 +101,7 @@ func (d *lz29Decoder) readFilterData() (b []byte, err error) { } buf := make([]byte, n+1) - buf[0] = flags + buf[0] = byte(flags) err = d.br.readFull(buf[1:]) return buf, err @@ -117,131 +113,150 @@ func (d *lz29Decoder) readEndOfBlock() error { return err } if n > 0 { - return endOfBlock + return errEndOfBlock } n, err = d.br.readBits(1) if err != nil { return err } if n > 0 { - return endOfBlockAndFile + return errEndOfBlockAndFile } - return endOfFile + return errEndOfFile } -func (d *lz29Decoder) decode(win *window) ([]byte, error) { - sym, err := d.mainDecoder.readSym(d.br) +func (d *lz29Decoder) decodeLength(i int) error { + offset := d.offset[i] + copy(d.offset[1:i+1], d.offset[:i]) + d.offset[0] = offset + + i, err := d.lengthDecoder.readSym(d.br) if err != nil { - return nil, err + return err } + d.length = lengthBase[i] + 2 + bits := lengthExtraBits[i] + if bits > 0 { + var n int + n, err = d.br.readBits(bits) + if err != nil { + return err + } + d.length += n + } + return nil +} - switch { - case sym < 256: - // literal - win.writeByte(byte(sym)) - return nil, nil - case sym == 256: - return nil, d.readEndOfBlock() - case sym == 257: - return d.readFilterData() - case sym == 258: - // use previous offset and length - case sym < 263: - i := sym - 259 - offset := d.offset[i] - copy(d.offset[1:i+1], d.offset[:i]) - d.offset[0] = offset - - i, err := d.lengthDecoder.readSym(d.br) +func (d *lz29Decoder) decodeShortOffset(i int) error { + copy(d.offset[1:], d.offset[:]) + offset := shortOffsetBase[i] + 1 + bits := shortOffsetExtraBits[i] + if bits > 0 { + n, err := d.br.readBits(bits) if err != nil { - return nil, err + return err } - d.length = lengthBase[i] + 2 - bits := lengthExtraBits[i] - if bits > 0 { - n, err := d.br.readBits(bits) - if err != nil { - return nil, err - } - d.length += n + offset += n + } + d.offset[0] = offset + d.length = 2 + return nil +} + +func (d *lz29Decoder) decodeOffset(i int) error { + d.length = lengthBase[i] + 3 + bits := lengthExtraBits[i] + if bits > 0 { + n, err := d.br.readBits(bits) + if err != nil { + return err } - case sym < 271: - i := sym - 263 - copy(d.offset[1:], d.offset[:]) - offset := shortOffsetBase[i] + 1 - bits := shortOffsetExtraBits[i] - if bits > 0 { - n, err := d.br.readBits(bits) + d.length += n + } + + var err error + i, err = d.offsetDecoder.readSym(d.br) + if err != nil { + return err + } + offset := offsetBase[i] + 1 + bits = 
offsetExtraBits[i] + + switch { + case bits >= 4: + if bits > 4 { + n, err := d.br.readBits(bits - 4) if err != nil { - return nil, err + return err } - offset += n + offset += n << 4 } - d.offset[0] = offset - - d.length = 2 - default: - i := sym - 271 - d.length = lengthBase[i] + 3 - bits := lengthExtraBits[i] - if bits > 0 { - n, err := d.br.readBits(bits) + + if d.lowOffsetRepeats > 0 { + d.lowOffsetRepeats-- + offset += d.lowOffset + } else { + n, err := d.lowOffsetDecoder.readSym(d.br) if err != nil { - return nil, err + return err + } + if n == 16 { + d.lowOffsetRepeats = 15 + offset += d.lowOffset + } else { + offset += n + d.lowOffset = n } - d.length += n } + case bits > 0: + n, err := d.br.readBits(bits) + if err != nil { + return err + } + offset += n + } + + if offset >= 0x2000 { + d.length++ + if offset >= 0x40000 { + d.length++ + } + } + copy(d.offset[1:], d.offset[:]) + d.offset[0] = offset + return nil +} - i, err = d.offsetDecoder.readSym(d.br) +// fill window until full, error, filter found or end of block. +func (d *lz29Decoder) fill(dr *decodeReader) ([]byte, error) { + for dr.notFull() { + sym, err := d.mainDecoder.readSym(d.br) if err != nil { return nil, err } - offset := offsetBase[i] + 1 - bits = offsetExtraBits[i] switch { - case bits >= 4: - if bits > 4 { - n, err := d.br.readBits(bits - 4) - if err != nil { - return nil, err - } - offset += n << 4 - } - - if d.lowOffsetRepeats > 0 { - d.lowOffsetRepeats-- - offset += d.lowOffset - } else { - n, err := d.lowOffsetDecoder.readSym(d.br) - if err != nil { - return nil, err - } - if n == 16 { - d.lowOffsetRepeats = 15 - offset += d.lowOffset - } else { - offset += n - d.lowOffset = n - } - } - case bits > 0: - n, err := d.br.readBits(bits) - if err != nil { - return nil, err - } - offset += n + case sym < 256: // literal + dr.writeByte(byte(sym)) + continue + case sym == 258: // use previous offset and length + dr.copyBytes(d.length, d.offset[0]) + continue + case sym >= 271: + err = d.decodeOffset(sym - 271) + case sym >= 263: + err = d.decodeShortOffset(sym - 263) + case sym >= 259: + err = d.decodeLength(sym - 259) + case sym == 256: + return nil, d.readEndOfBlock() + default: // sym == 257 + return d.readFilterData() } - - if offset >= 0x2000 { - d.length++ - if offset >= 0x40000 { - d.length++ - } + if err != nil { + return nil, err } - copy(d.offset[1:], d.offset[:]) - d.offset[0] = offset + dr.copyBytes(d.length, d.offset[0]) } - win.copyBytes(d.length, d.offset[0]) return nil, nil } diff --git a/vendor/github.com/nwaples/rardecode/decode29_ppm.go b/vendor/github.com/nwaples/rardecode/v2/decode29_ppm.go similarity index 50% rename from vendor/github.com/nwaples/rardecode/decode29_ppm.go rename to vendor/github.com/nwaples/rardecode/v2/decode29_ppm.go index 39c3199584..365b8529c3 100644 --- a/vendor/github.com/nwaples/rardecode/decode29_ppm.go +++ b/vendor/github.com/nwaples/rardecode/v2/decode29_ppm.go @@ -1,11 +1,9 @@ package rardecode -import "io" - type ppm29Decoder struct { m model // ppm model esc byte // escape character - br io.ByteReader + br *rarBitReader } func (d *ppm29Decoder) init(br *rarBitReader) error { @@ -15,13 +13,16 @@ func (d *ppm29Decoder) init(br *rarBitReader) error { } reset := maxOrder&0x20 > 0 - // Should have flushed all unread bits from bitReader by now, - // use underlying ByteReader - d.br = br.r + // Move any bytes in rarBitReader bit cache back into a byte slice. 
+ // PPM only reads bytes so it is more efficient to read those bytes +// directly from byte slices, bypassing the extra bit shifts. + br.unshiftBytes() + d.br = br var maxMB int if reset { - c, err := d.br.ReadByte() + var c byte + c, err = d.br.ReadByte() if err != nil { return err } @@ -54,13 +55,15 @@ func (d *ppm29Decoder) readFilterData() ([]byte, error) { } n := int(c&7) + 1 if n == 7 { - b, err := d.m.ReadByte() + var b byte + b, err = d.m.ReadByte() if err != nil { return nil, err } n += int(b) } else if n == 8 { - b, err := d.m.ReadByte() + var b byte + b, err = d.m.ReadByte() if err != nil { return nil, err } @@ -74,7 +77,7 @@ func (d *ppm29Decoder) readFilterData() ([]byte, error) { n++ buf := make([]byte, n) - buf[0] = byte(c) + buf[0] = c for i := 1; i < n; i++ { buf[i], err = d.m.ReadByte() if err != nil { @@ -84,49 +87,61 @@ func (d *ppm29Decoder) readFilterData() ([]byte, error) { return buf, nil } -func (d *ppm29Decoder) decode(w *window) ([]byte, error) { - c, err := d.m.ReadByte() - if err != nil { - return nil, err - } - if c != d.esc { - w.writeByte(c) - return nil, nil - } - c, err = d.m.ReadByte() - if err != nil { - return nil, err - } - - switch c { - case 0: - return nil, endOfBlock - case 2: - return nil, endOfBlockAndFile - case 3: - return d.readFilterData() - case 4: - offset := 0 - for i := 0; i < 3; i++ { - c, err = d.m.ReadByte() - if err != nil { - return nil, err - } - offset = offset<<8 | int(c) - } - len, err := d.m.ReadByte() +// fill window until full, error, filter found or end of block. +func (d *ppm29Decoder) fill(dr *decodeReader) ([]byte, error) { + for dr.notFull() { + c, err := d.m.ReadByte() if err != nil { return nil, err } - w.copyBytes(int(len)+32, offset+2) - case 5: - len, err := d.m.ReadByte() + if c != d.esc { + dr.writeByte(c) + continue + } + c, err = d.m.ReadByte() if err != nil { return nil, err } - w.copyBytes(int(len)+4, 1) - default: - w.writeByte(d.esc) + + switch c { + case 0: + return nil, errEndOfBlock + case 2: + return nil, errEndOfBlockAndFile + case 3: + return d.readFilterData() + case 4: + offset := 0 + for i := 0; i < 3; i++ { + c, err = d.m.ReadByte() + if err != nil { + return nil, err + } + offset = offset<<8 | int(c) + } + len, err := d.m.ReadByte() + if err != nil { + return nil, err + } + dr.copyBytes(int(len)+32, offset+2) + case 5: + len, err := d.m.ReadByte() + if err != nil { + return nil, err + } + dr.copyBytes(int(len)+4, 1) + default: + dr.writeByte(d.esc) + } } return nil, nil } + +func newPPM29Decoder() *ppm29Decoder { + ppm := new(ppm29Decoder) + ppm.reset() + ppm.m.maxOrder = 2 + ppm.m.a.init(1) + + return ppm +} diff --git a/vendor/github.com/nwaples/rardecode/v2/decode50.go b/vendor/github.com/nwaples/rardecode/v2/decode50.go new file mode 100644 index 0000000000..2e131c6f76 --- /dev/null +++ b/vendor/github.com/nwaples/rardecode/v2/decode50.go @@ -0,0 +1,311 @@ +package rardecode + +import ( + "errors" + "io" + "math/bits" +) + +const ( + mainSize5 = 306 + offsetSize5 = 64 + lowoffsetSize5 = 16 + lengthSize5 = 44 + tableSize5 = mainSize5 + offsetSize5 + lowoffsetSize5 + lengthSize5 + + offsetSize7 = 80 + tableSize7 = mainSize5 + offsetSize7 + lowoffsetSize5 + lengthSize5 +) + +var ( + ErrUnknownFilter = errors.New("rardecode: unknown V5 filter") + ErrCorruptDecodeHeader = errors.New("rardecode: corrupt decode header") +) + +// decoder50 implements the decoder interface for RAR 5 compression. +// Decode input is broken up into 1 or more blocks.
Each block starts with +// a header containing block length and optional code length tables to initialize +// the huffman decoders with. +type decoder50 struct { + br rar5BitReader // bit reader for current data block + buf [tableSize7]byte + codeLength []byte + offsetSize int + + lastBlock bool // current block is last block in compressed file + + mainDecoder huffmanDecoder + offsetDecoder huffmanDecoder + lowoffsetDecoder huffmanDecoder + lengthDecoder huffmanDecoder + + offset [4]int + length int +} + +func (d *decoder50) version() int { return decode50Ver } + +func (d *decoder50) init(r byteReader, reset bool, size int64, ver int) { + d.br.reset(r) + d.lastBlock = false + if ver == decode70Ver { + d.codeLength = d.buf[:] + d.offsetSize = offsetSize7 + } else { + d.codeLength = d.buf[:tableSize5] + d.offsetSize = offsetSize5 + } + + if reset { + clear(d.offset[:]) + d.length = 0 + clear(d.codeLength[:]) + } +} + +func (d *decoder50) readBlockHeader() error { + flags, err := d.br.ReadByte() + if err != nil { + return err + } + + bytecount := (flags>>3)&3 + 1 + if bytecount == 4 { + return ErrCorruptDecodeHeader + } + + hsum, err := d.br.ReadByte() + if err != nil { + return err + } + + blockBits := int(flags)&0x07 + 1 + blockBytes := 0 + sum := 0x5a ^ flags + for i := byte(0); i < bytecount; i++ { + var n byte + n, err = d.br.ReadByte() + if err != nil { + return err + } + sum ^= n + blockBytes |= int(n) << (i * 8) + } + if sum != hsum { // bad header checksum + return ErrCorruptDecodeHeader + } + blockBits += (blockBytes - 1) * 8 + + // reset the bits limit + d.br.setLimit(blockBits) + d.lastBlock = flags&0x40 > 0 + + if flags&0x80 > 0 { + // read new code length tables and reinitialize huffman decoders + cl := d.codeLength[:] + err = readCodeLengthTable(&d.br, cl, false) + if err != nil { + return err + } + d.mainDecoder.init(cl[:mainSize5]) + cl = cl[mainSize5:] + d.offsetDecoder.init(cl[:d.offsetSize]) + cl = cl[d.offsetSize:] + d.lowoffsetDecoder.init(cl[:lowoffsetSize5]) + cl = cl[lowoffsetSize5:] + d.lengthDecoder.init(cl) + } + return nil +} + +func slotToLength(br bitReader, n int) (int, error) { + if n >= 8 { + bits := uint8(n/4 - 1) + n = (4 | (n & 3)) << bits + if bits > 0 { + b, err := br.readBits(bits) + if err != nil { + return 0, err + } + n |= b + } + } + n += 2 + return n, nil +} + +// readFilter5Data reads an encoded integer used in V5 filters. +func readFilter5Data(br bitReader) (int, error) { + // TODO: should data really be uint? (for 32bit ints). + // It will be masked later anyway by decode window mask. 
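The block header parsed by `readBlockHeader` above is guarded by a one-byte XOR checksum: 0x5a XOR the flags byte XOR each block-size byte must equal the stored checksum byte. A standalone sketch of just that check (the example values are made up):

```go
package main

import "fmt"

// checkBlockHeader mirrors the XOR checksum in readBlockHeader above: the
// stored checksum byte must equal 0x5a ^ flags ^ every block-size byte.
func checkBlockHeader(flags, hsum byte, sizeBytes []byte) bool {
	sum := byte(0x5a) ^ flags
	for _, b := range sizeBytes {
		sum ^= b
	}
	return sum == hsum
}

func main() {
	flags := byte(0x88)        // new tables; (0x88>>3)&3+1 == 2 size bytes
	size := []byte{0x34, 0x12} // little-endian block length 0x1234
	hsum := byte(0x5a) ^ flags ^ 0x34 ^ 0x12
	fmt.Println(checkBlockHeader(flags, hsum, size)) // true
}
```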
+ bytes, err := br.readBits(2) + if err != nil { + return 0, err + } + bytes++ + + var data int + for i := 0; i < bytes; i++ { + n, err := br.readBits(8) + if err != nil { + return 0, err + } + data |= n << (uint(i) * 8) + } + return data, nil +} + +func (d *decoder50) readFilter(dr *decodeReader) error { + fb := new(filterBlock) + var err error + + fb.offset, err = readFilter5Data(&d.br) + if err != nil { + return err + } + fb.length, err = readFilter5Data(&d.br) + if err != nil { + return err + } + ftype, err := d.br.readBits(3) + if err != nil { + return err + } + switch ftype { + case 0: + n, err := d.br.readBits(5) + if err != nil { + return err + } + fb.filter = func(buf []byte, offset int64) ([]byte, error) { return filterDelta(n+1, buf) } + case 1: + fb.filter = func(buf []byte, offset int64) ([]byte, error) { return filterE8(0xe8, true, buf, offset) } + case 2: + fb.filter = func(buf []byte, offset int64) ([]byte, error) { return filterE8(0xe9, true, buf, offset) } + case 3: + fb.filter = filterArm + default: + return ErrUnknownFilter + } + return dr.queueFilter(fb) +} + +func (d *decoder50) decodeLength(dr *decodeReader, i int) error { + offset := d.offset[i] + copy(d.offset[1:i+1], d.offset[:i]) + d.offset[0] = offset + + sl, err := d.lengthDecoder.readSym(&d.br) + if err != nil { + return err + } + d.length, err = slotToLength(&d.br, sl) + if err == nil { + dr.copyBytes(d.length, d.offset[0]) + } + return err +} + +func (d *decoder50) decodeOffset(dr *decodeReader, i int) error { + length, err := slotToLength(&d.br, i) + if err != nil { + return err + } + + offset := 1 + slot, err := d.offsetDecoder.readSym(&d.br) + if err != nil { + return err + } + if slot < 4 { + offset += slot + } else { + bitCount := uint8(slot/2 - 1) + offset += (2 | (slot & 1)) << bitCount + + if bitCount >= 4 { + bitCount -= 4 + if bitCount > 0 { + if bits.UintSize == 32 { + // bitReader can only read at most intSize-8 bits. + // Split read into two parts. 
+ if bitCount > 24 { + n, err := d.br.readBits(24) + if err != nil { + return err + } + bitCount -= 24 + offset += n << (4 + bitCount) + } + } + n, err := d.br.readBits(bitCount) + if err != nil { + return err + } + offset += n << 4 + } + n, err := d.lowoffsetDecoder.readSym(&d.br) + if err != nil { + return err + } + offset += n + } else { + n, err := d.br.readBits(bitCount) + if err != nil { + return err + } + offset += n + } + } + if offset > 0x100 { + length++ + if offset > 0x2000 { + length++ + if offset > 0x40000 { + length++ + } + } + } + copy(d.offset[1:], d.offset[:]) + d.offset[0] = offset + d.length = length + dr.copyBytes(d.length, d.offset[0]) + return nil +} + +func (d *decoder50) fill(dr *decodeReader) error { + for dr.notFull() { + sym, err := d.mainDecoder.readSym(&d.br) + if err == nil { + switch { + case sym < 256: + // literal + dr.writeByte(byte(sym)) + continue + case sym >= 262: + err = d.decodeOffset(dr, sym-262) + case sym >= 258: + err = d.decodeLength(dr, sym-258) + case sym == 257: + // use previous offset and length + dr.copyBytes(d.length, d.offset[0]) + continue + default: // sym == 256: + err = d.readFilter(dr) + } + } else if err == io.EOF { + // reached end of the block + if d.lastBlock { + return io.EOF + } + err = d.readBlockHeader() + } + if err != nil { + if err == io.EOF { + return ErrDecoderOutOfData + } + return err + } + } + return nil +} diff --git a/vendor/github.com/nwaples/rardecode/v2/decode_reader.go b/vendor/github.com/nwaples/rardecode/v2/decode_reader.go new file mode 100644 index 0000000000..55fb1f695d --- /dev/null +++ b/vendor/github.com/nwaples/rardecode/v2/decode_reader.go @@ -0,0 +1,308 @@ +package rardecode + +import "errors" + +const ( + minWindowSize = 0x40000 + maxQueuedFilters = 8192 +) + +var ( + ErrTooManyFilters = errors.New("rardecode: too many filters") + ErrInvalidFilter = errors.New("rardecode: invalid filter") + ErrMultipleDecoders = errors.New("rardecode: multiple decoders in a single archive not supported") +) + +// filter functions take a byte slice, the current output offset and +// return transformed data. +type filter func(b []byte, offset int64) ([]byte, error) + +// filterBlock is a block of data to be processed by a filter. +type filterBlock struct { + length int // length of block + offset int // bytes to be read before start of block + filter filter // filter function +} + +// decoder is the interface for decoding compressed data +type decoder interface { + init(r byteReader, reset bool, size int64, ver int) // initialize decoder for current file + fill(dr *decodeReader) error // fill window with decoded data + version() int // decoder version +} +
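The `decodeReader` defined next maintains the sliding window that the decoders' `copyBytes` calls write into. The essential LZ77 property is that a copy may overlap its own output when length exceeds offset; a plain `copy()` would read stale bytes. A minimal sketch of those semantics (not the package's implementation, which also handles window wrap-around):

```go
package main

import "fmt"

// lzCopy copies length bytes from offset bytes back, one byte at a time,
// so a copy that overlaps its own output repeats what it just wrote.
func lzCopy(win []byte, w, length, offset int) int {
	for ; length > 0 && w < len(win); length-- {
		win[w] = win[w-offset]
		w++
	}
	return w
}

func main() {
	win := make([]byte, 8)
	w := copy(win, "ab")        // window holds "ab"
	w = lzCopy(win, w, 6, 2)    // copy 6 bytes starting 2 back
	fmt.Printf("%s\n", win[:w]) // "abababab"
}
```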
+// decodeReader implements io.Reader for decoding compressed data in RAR archives. +type decodeReader struct { + tot int64 // total bytes read from window + outbuf []byte // buffered output + buf []byte // filter buffer + fl []*filterBlock // list of filters each with offset relative to previous in list + dec decoder // decoder being used to unpack file + err error // current decoder error output + br byteReader + + win []byte // sliding window buffer + size int // win length + r int // index in win for reads (beginning) + w int // index in win for writes (end) +} + +func (d *decodeReader) init(r byteReader, ver int, size int, reset bool, unPackedSize int64) error { + d.outbuf = nil + d.tot = 0 + d.err = nil + if reset { + d.fl = nil + } + d.br = r + + // initialize window + size = max(size, minWindowSize) + if size > len(d.win) { + b := make([]byte, size) + if reset { + d.w = 0 + } else if len(d.win) > 0 { + n := copy(b, d.win[d.w:]) + n += copy(b[n:], d.win[:d.w]) + d.w = n + } + d.win = b + d.size = size + } else if reset { + clear(d.win[:]) + d.w = 0 + } + d.r = d.w + + // initialize decoder + if d.dec == nil { + switch ver { + case decode29Ver: + d.dec = new(decoder29) + case decode50Ver, decode70Ver: + d.dec = new(decoder50) + case decode20Ver: + d.dec = new(decoder20) + default: + return ErrUnknownDecoder + } + } else if d.dec.version() != ver { + return ErrMultipleDecoders + } + d.dec.init(r, reset, unPackedSize, ver) + return nil +} + +// notFull reports whether the window is not full +func (d *decodeReader) notFull() bool { return d.w < d.size } + +// writeByte writes c to the end of the window +func (d *decodeReader) writeByte(c byte) { + d.win[d.w] = c + d.w++ +} + +// copyBytes copies length bytes at offset distance from the end +// to the end of the window. +func (d *decodeReader) copyBytes(length, offset int) { + length %= d.size + if length < 0 { + length += d.size + } + + i := (d.w - offset) % d.size + if i < 0 { + i += d.size + } + iend := i + length + if i > d.w { + if iend > d.size { + iend = d.size + } + n := copy(d.win[d.w:], d.win[i:iend]) + d.w += n + length -= n + if length == 0 { + return + } + iend = length + i = 0 + } + if iend <= d.w { + n := copy(d.win[d.w:], d.win[i:iend]) + d.w += n + return + } + for length > 0 && d.w < d.size { + d.win[d.w] = d.win[i] + d.w++ + i++ + length-- + } +} + +// queueFilter adds a filterBlock to the end of decodeReader's filters. +func (d *decodeReader) queueFilter(f *filterBlock) error { + if len(d.fl) >= maxQueuedFilters { + return ErrTooManyFilters + } + // make offset relative to read index (from write index) + f.offset += d.w - d.r + // make offset relative to previous filter in list + for _, fb := range d.fl { + if f.offset < fb.offset { + // filter block must not start before previous filter + return ErrInvalidFilter + } + f.offset -= fb.offset + } + // offset & length must be < window size + f.offset %= d.size + if f.offset < 0 { + f.offset += d.size + } + f.length %= d.size + if f.length < 0 { + f.length += d.size + } + d.fl = append(d.fl, f) + return nil +} + +func (d *decodeReader) readErr() error { + err := d.err + d.err = nil + return err +} + +// fill the decodeReader window +func (d *decodeReader) fill() error { + if d.err != nil { + return d.readErr() + } + if d.w == d.size { + // wrap to beginning of buffer + d.r = 0 + d.w = 0 + } + d.err = d.dec.fill(d) // fill window using decoder + if d.w == d.r { + return d.readErr() + } + return nil +} +
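`queueFilter` above stores each filter's offset as a delta from the previous filter in the queue, so consuming the head filter never requires rewriting the remaining entries. A sketch of just that bookkeeping, with hypothetical absolute offsets:

```go
package main

import "fmt"

// queueRelative converts an absolute offset (relative to the read index)
// into the delta form used by queueFilter above: subtract every earlier
// filter's stored delta, rejecting filters that would start before one
// already queued.
func queueRelative(queue []int, absOffset int) ([]int, bool) {
	for _, prev := range queue {
		if absOffset < prev {
			return queue, false // must not start before an earlier filter
		}
		absOffset -= prev
	}
	return append(queue, absOffset), true
}

func main() {
	var q []int
	q, _ = queueRelative(q, 10) // first filter starts 10 bytes ahead
	q, _ = queueRelative(q, 25) // absolute 25 is stored as delta 15
	fmt.Println(q)              // [10 15]
}
```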
+// bufBytes returns n bytes from the window in a new buffer. +func (d *decodeReader) bufBytes(n int) ([]byte, error) { + if cap(d.buf) < n { + d.buf = make([]byte, n) + } + // copy into buffer + ns := 0 + for { + nn := copy(d.buf[ns:n], d.win[d.r:d.w]) + d.r += nn + ns += nn + if ns == n { + break + } + if err := d.fill(); err != nil { + return nil, err + } + } + return d.buf[:n], nil +} + +// processFilters processes any filters valid at the current read index +// and returns the output in outbuf. +func (d *decodeReader) processFilters() ([]byte, error) { + f := d.fl[0] + flen := f.length + + // get filter input + b, err := d.bufBytes(flen) + if err != nil { + return nil, err + } + for { + d.fl = d.fl[1:] + // run filter passing buffer and total bytes read so far + b, err = f.filter(b, d.tot) + if err != nil { + return nil, err + } + if len(d.fl) == 0 { + d.fl = nil + return b, nil + } + // get next filter + f = d.fl[0] + if f.offset != 0 { + // next filter not at current offset + f.offset -= flen + return b, nil + } + if f.length != len(b) { + return nil, ErrInvalidFilter + } + } +} + +// bytes returns a decoded byte slice or an error. +func (d *decodeReader) bytes() ([]byte, error) { + // fill window if needed + if d.w == d.r { + if err := d.fill(); err != nil { + return nil, err + } + } + n := d.w - d.r + + // return current unread bytes if there are no filters + if len(d.fl) == 0 { + b := d.win[d.r:d.w] + d.r = d.w + d.tot += int64(n) + return b, nil + } + + // check filters + f := d.fl[0] + if f.offset < 0 { + return nil, ErrInvalidFilter + } + if f.offset > 0 { + // filter not at current read index, output bytes before it + n = min(f.offset, n) + b := d.win[d.r : d.r+n] + d.r += n + f.offset -= n + d.tot += int64(n) + return b, nil + } + + // process filters at current index + b, err := d.processFilters() + if cap(b) > cap(d.buf) { + // filter returned a larger buffer, cache it + d.buf = b + } + + d.tot += int64(len(b)) + return b, err +} + +// Read decodes data and stores it in p. +func (d *decodeReader) Read(p []byte) (int, error) { + var err error + if len(d.outbuf) == 0 { + d.outbuf, err = d.bytes() + if err != nil { + return 0, err + } + } + n := copy(p, d.outbuf) + d.outbuf = d.outbuf[n:] + return n, err +} diff --git a/vendor/github.com/nwaples/rardecode/v2/decrypt_reader.go b/vendor/github.com/nwaples/rardecode/v2/decrypt_reader.go new file mode 100644 index 0000000000..cea1733b32 --- /dev/null +++ b/vendor/github.com/nwaples/rardecode/v2/decrypt_reader.go @@ -0,0 +1,203 @@ +package rardecode + +import ( + "crypto/aes" + "crypto/cipher" + "io" +) + +// cipherBlockSliceReader is a sliceReader that uses a cipher.BlockMode to decrypt the input. +type cipherBlockSliceReader struct { + r sliceReader + mode cipher.BlockMode + n int // bytes decrypted but not read +} + +func (c *cipherBlockSliceReader) sizeInBlocks(n int) int { + bs := c.mode.BlockSize() + if rem := n % bs; rem > 0 { + n += bs - rem + } + return n +} + +func (c *cipherBlockSliceReader) peek(n int) ([]byte, error) { + bn := c.sizeInBlocks(n) + b, err := c.r.peek(bn) + if err != nil { + if err == io.EOF && len(b) > 0 { + err = io.ErrUnexpectedEOF + } + return nil, err + } + if c.n < bn { + c.mode.CryptBlocks(b[c.n:], b[c.n:]) + c.n = bn + } + return b[:n], nil +} + +// readSlice returns the next n bytes of decrypted input. +// If n is not a multiple of the block size, the trailing bytes +// of the last decrypted block will be discarded.
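`cipherBlockSliceReader` can only decrypt whole AES blocks, so `sizeInBlocks` rounds every request up to a block boundary and `readSlice`, defined next, discards the padding. A self-contained sketch of the same rounding plus an in-place CBC decrypt using the standard library (all-zero key and IV, demo values only):

```go
package main

import (
	"crypto/aes"
	"crypto/cipher"
	"fmt"
)

// roundUp mirrors sizeInBlocks above: CBC operates on whole blocks, so a
// request for n bytes is padded out to the next block boundary.
func roundUp(n, blockSize int) int {
	if rem := n % blockSize; rem > 0 {
		n += blockSize - rem
	}
	return n
}

func main() {
	key := make([]byte, 16) // demo only: all-zero key and IV
	iv := make([]byte, 16)
	block, err := aes.NewCipher(key)
	if err != nil {
		panic(err)
	}
	ct := make([]byte, roundUp(20, block.BlockSize())) // 20 -> 32 bytes
	cipher.NewCBCDecrypter(block, iv).CryptBlocks(ct, ct)
	fmt.Println(len(ct), "bytes decrypted; the caller keeps the first 20")
}
```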
+func (c *cipherBlockSliceReader) readSlice(n int) ([]byte, error) { + bn := c.sizeInBlocks(n) + b, err := c.r.readSlice(bn) + if err != nil { + return nil, err + } + if c.n < bn { + c.mode.CryptBlocks(b[c.n:], b[c.n:]) + c.n = 0 + } else { + c.n -= bn + } + // ignore padding at end of last block + b = b[:n] + return b, nil +} + +// newAesSliceReader creates a sliceReader that uses AES to decrypt the input +func newAesSliceReader(r sliceReader, key, iv []byte) *cipherBlockSliceReader { + block, err := aes.NewCipher(key) + if err != nil { + panic(err) + } + mode := cipher.NewCBCDecrypter(block, iv) + return &cipherBlockSliceReader{r: r, mode: mode} +} + +// cipherBlockReader implements Block Mode decryption of an io.Reader object. +type cipherBlockReader struct { + r byteReader + mode cipher.BlockMode + getMode func() (cipher.BlockMode, error) + inbuf []byte // raw input blocks not yet decrypted + outbuf []byte // output buffer used when output slice < block size + block []byte // output buffer for a single block +} + +// readBlock returns a single decrypted block. +func (cr *cipherBlockReader) readBlock() ([]byte, error) { + bs := len(cr.block) + if len(cr.inbuf) >= bs { + cr.mode.CryptBlocks(cr.block, cr.inbuf[:bs]) + cr.inbuf = cr.inbuf[bs:] + } else { + n := copy(cr.block, cr.inbuf) + cr.inbuf = nil + _, err := io.ReadFull(cr.r, cr.block[n:]) + if err != nil { + return nil, err + } + cr.mode.CryptBlocks(cr.block, cr.block) + } + return cr.block, nil +} + +// Read reads and decrypts data into p. +// If the input is not a multiple of the cipher block size, +// the trailing bytes will be ignored. +func (cr *cipherBlockReader) Read(p []byte) (int, error) { + if len(cr.outbuf) > 0 { + n := copy(p, cr.outbuf) + cr.outbuf = cr.outbuf[n:] + return n, nil + } + // get input blocks + for len(cr.inbuf) == 0 { + var err error + cr.inbuf, err = cr.r.bytes() + if err != nil { + return 0, err + } + } + if cr.mode == nil { + var err error + cr.mode, err = cr.getMode() + if err != nil { + return 0, err + } + cr.block = make([]byte, cr.mode.BlockSize()) + } + bs := cr.mode.BlockSize() + n := len(cr.inbuf) + l := len(p) + if n < bs || l < bs { + // Next encrypted block spans volumes or Read buffer is too small + // to fit a single block. Decrypt a single block and store the + // leftover in outbuf. + b, err := cr.readBlock() + if err != nil { + return 0, err + } + n = copy(p, b) + cr.outbuf = b[n:] + return n, nil + } + // output buffer smaller than input + n = min(l, n) + // round down to block size + n -= n % bs + cr.mode.CryptBlocks(p[:n], cr.inbuf[:n]) + cr.inbuf = cr.inbuf[n:] + return n, nil +} + +// bytes returns a byte slice of decrypted data. 
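`cipherBlockReader.Read` above parks any decrypted bytes that do not fit the caller's slice in `outbuf` and serves them on the next call; `bytes`, defined next, drains the same stash first. A stripped-down sketch of that carry-over pattern, where the `next` closure stands in for decrypting a block:

```go
package main

import "fmt"

// carryReader shows the leftover-buffer pattern: when the caller's slice
// is smaller than one produced block, the remainder is parked in outbuf
// and served on the next call.
type carryReader struct {
	outbuf []byte
	next   func() []byte // stands in for "decrypt the next block"
}

func (c *carryReader) Read(p []byte) (int, error) {
	if len(c.outbuf) == 0 {
		c.outbuf = c.next()
	}
	n := copy(p, c.outbuf)
	c.outbuf = c.outbuf[n:]
	return n, nil
}

func main() {
	r := &carryReader{next: func() []byte { return []byte("0123456789abcdef") }}
	p := make([]byte, 5)
	n, _ := r.Read(p)
	fmt.Printf("%q read, %q still buffered\n", p[:n], r.outbuf)
}
```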
+func (cr *cipherBlockReader) bytes() ([]byte, error) { + if len(cr.outbuf) > 0 { + b := cr.outbuf + cr.outbuf = nil + return b, nil + } + // get more input + for len(cr.inbuf) == 0 { + var err error + cr.inbuf, err = cr.r.bytes() + if err != nil { + return nil, err + } + } + if cr.mode == nil { + var err error + cr.mode, err = cr.getMode() + if err != nil { + return nil, err + } + cr.block = make([]byte, cr.mode.BlockSize()) + } + bs := cr.mode.BlockSize() + if len(cr.inbuf) < bs { + // next encrypted block spans volumes + return cr.readBlock() + } + n := len(cr.inbuf) + n -= n % bs + // get input buffer and round down to nearest block boundary + b := cr.inbuf[:n] + cr.inbuf = cr.inbuf[n:] + cr.mode.CryptBlocks(b, b) + return b, nil +} + +func newCipherBlockReader(r byteReader, getMode func() (cipher.BlockMode, error)) *cipherBlockReader { + c := &cipherBlockReader{r: r, getMode: getMode} + return c +} + +// newAesDecryptReader returns a cipherBlockReader that decrypts input from a given io.Reader using AES. +func newAesDecryptReader(r byteReader, h *fileBlockHeader) *cipherBlockReader { + getMode := func() (cipher.BlockMode, error) { + err := h.genKeys() + if err != nil { + return nil, err + } + block, err := aes.NewCipher(h.key) + if err != nil { + return nil, err + } + return cipher.NewCBCDecrypter(block, h.iv), nil + } + return newCipherBlockReader(r, getMode) +} diff --git a/vendor/github.com/nwaples/rardecode/filters.go b/vendor/github.com/nwaples/rardecode/v2/filters.go similarity index 97% rename from vendor/github.com/nwaples/rardecode/filters.go rename to vendor/github.com/nwaples/rardecode/v2/filters.go index a9eb0407d9..57e4e4cc4b 100644 --- a/vendor/github.com/nwaples/rardecode/filters.go +++ b/vendor/github.com/nwaples/rardecode/v2/filters.go @@ -1,10 +1,10 @@ package rardecode import ( - "bytes" "encoding/binary" "hash/crc32" "io" + "math" ) const ( @@ -13,8 +13,6 @@ const ( vmGlobalAddr = 0x3C000 vmGlobalSize = 0x02000 vmFixedGlobalSize = 0x40 - - maxUint32 = 1<<32 - 1 ) // v3Filter is the interface type for RAR V3 filters. @@ -79,12 +77,12 @@ func e8e9FilterV3(r map[int]uint32, global, buf []byte, offset int64) ([]byte, e func getBits(buf []byte, pos, count uint) uint32 { n := binary.LittleEndian.Uint32(buf[pos/8:]) n >>= pos & 7 - mask := uint32(maxUint32) >> (32 - count) + mask := uint32(math.MaxUint32) >> (32 - count) return n & mask } func setBits(buf []byte, pos, count uint, bits uint32) { - mask := uint32(maxUint32) >> (32 - count) + mask := uint32(math.MaxUint32) >> (32 - count) mask <<= pos & 7 bits <<= pos & 7 n := binary.LittleEndian.Uint32(buf[pos/8:]) @@ -297,7 +295,7 @@ type vmFilter struct { // execute implements v3filter type for VM based RAR 3 filters. 
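The `getBits`/`setBits` helpers in the filters.go hunk above treat the buffer as a little-endian bit stream: load a `uint32` at `pos/8`, shift by `pos&7`, and mask to `count` bits. A runnable sketch of the read side; count is assumed small enough that the field fits in the loaded word, as in the original:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// getBits reads count bits starting at bit position pos, LSB-first,
// the same scheme as getBits in filters.go above.
func getBits(buf []byte, pos, count uint) uint32 {
	n := binary.LittleEndian.Uint32(buf[pos/8:])
	n >>= pos & 7
	return n & (^uint32(0) >> (32 - count))
}

func main() {
	buf := []byte{0b1011_0100, 0x00, 0x00, 0x00, 0x00}
	// bits 2..4 of 0b10110100 are 1, 0, 1 (LSB-first) -> 0b101 == 5
	fmt.Println(getBits(buf, 2, 3))
}
```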
func (f *vmFilter) execute(r map[int]uint32, global, buf []byte, offset int64) ([]byte, error) { if len(buf) > vmGlobalAddr { - return buf, errInvalidFilter + return buf, ErrInvalidFilter } v := newVM(buf) @@ -388,7 +386,7 @@ func getV3Filter(code []byte) (v3Filter, error) { // create new vm filter f := new(vmFilter) - r := newRarBitReader(bytes.NewReader(code[1:])) // skip first xor byte check + r := newRarBitReader(newBufByteReader(code[1:])) // skip first xor byte check // read static data n, err := r.readBits(1) @@ -396,7 +394,8 @@ func getV3Filter(code []byte) (v3Filter, error) { return nil, err } if n > 0 { - m, err := r.readUint32() + var m uint32 + m, err = r.readUint32() if err != nil { return nil, err } diff --git a/vendor/github.com/nwaples/rardecode/huffman.go b/vendor/github.com/nwaples/rardecode/v2/huffman.go similarity index 73% rename from vendor/github.com/nwaples/rardecode/huffman.go rename to vendor/github.com/nwaples/rardecode/v2/huffman.go index 4acb69d5a9..77186850ac 100644 --- a/vendor/github.com/nwaples/rardecode/huffman.go +++ b/vendor/github.com/nwaples/rardecode/v2/huffman.go @@ -12,22 +12,22 @@ const ( ) var ( - errHuffDecodeFailed = errors.New("rardecode: huffman decode failed") - errInvalidLengthTable = errors.New("rardecode: invalid huffman code length table") + ErrHuffDecodeFailed = errors.New("rardecode: huffman decode failed") + ErrInvalidLengthTable = errors.New("rardecode: invalid huffman code length table") ) type huffmanDecoder struct { - limit [maxCodeLength + 1]int - pos [maxCodeLength + 1]int - symbol []int - min uint - quickbits uint - quicklen [maxQuickSize]uint - quicksym [maxQuickSize]int + limit [maxCodeLength + 1]uint16 + pos [maxCodeLength + 1]uint16 + symbol []uint16 + min uint8 + quickbits uint8 + quicklen [maxQuickSize]uint8 + quicksym [maxQuickSize]uint16 } func (h *huffmanDecoder) init(codeLengths []byte) { - var count [maxCodeLength + 1]int + count := make([]uint16, maxCodeLength+1) for _, n := range codeLengths { if n == 0 { @@ -39,7 +39,7 @@ func (h *huffmanDecoder) init(codeLengths []byte) { h.pos[0] = 0 h.limit[0] = 0 h.min = 0 - for i := uint(1); i <= maxCodeLength; i++ { + for i := uint8(1); i <= maxCodeLength; i++ { h.limit[i] = h.limit[i-1] + count[i]<<(maxCodeLength-i) h.pos[i] = h.pos[i-1] + count[i-1] if h.min == 0 && h.limit[i] > 0 { @@ -49,17 +49,15 @@ func (h *huffmanDecoder) init(codeLengths []byte) { if cap(h.symbol) >= len(codeLengths) { h.symbol = h.symbol[:len(codeLengths)] - for i := range h.symbol { - h.symbol[i] = 0 - } + clear(h.symbol) } else { - h.symbol = make([]int, len(codeLengths)) + h.symbol = make([]uint16, len(codeLengths)) } - copy(count[:], h.pos[:]) + copy(count, h.pos[:]) for i, n := range codeLengths { if n != 0 { - h.symbol[count[n]] = i + h.symbol[count[n]] = uint16(i) count[n]++ } } @@ -70,8 +68,8 @@ func (h *huffmanDecoder) init(codeLengths []byte) { h.quickbits = maxQuickBits - 3 } - bits := uint(1) - for i := 0; i < 1<= h.limit[bits] && bits < maxCodeLength { @@ -82,7 +80,7 @@ func (h *huffmanDecoder) init(codeLengths []byte) { dist := v - h.limit[bits-1] dist >>= (maxCodeLength - bits) - pos := h.pos[bits] + dist + pos := int(h.pos[bits]) + int(dist) if pos < len(h.symbol) { h.quicksym[i] = h.symbol[pos] } else { @@ -92,34 +90,34 @@ func (h *huffmanDecoder) init(codeLengths []byte) { } func (h *huffmanDecoder) readSym(r bitReader) (int, error) { - bits := uint(maxCodeLength) - v, err := r.readBits(maxCodeLength) + var bits uint8 + var v 
uint16 + n, err := r.readBits(maxCodeLength) if err != nil { if err != io.EOF { return 0, err } // fall back to 1 bit at a time if we read past EOF - for i := uint(1); i <= maxCodeLength; i++ { + for bits = 1; bits <= maxCodeLength; bits++ { b, err := r.readBits(1) if err != nil { return 0, err // not enough bits return error } - v |= b << (maxCodeLength - i) - if v < h.limit[i] { - bits = i + v |= uint16(b) << (maxCodeLength - bits) + if v < h.limit[bits] { break } } } else { + v = uint16(n) if v < h.limit[h.quickbits] { i := v >> (maxCodeLength - h.quickbits) r.unreadBits(maxCodeLength - h.quicklen[i]) - return h.quicksym[i], nil + return int(h.quicksym[i]), nil } - for i, n := range h.limit[h.min:] { - if v < n { - bits = h.min + uint(i) + for bits = h.min; bits < maxCodeLength; bits++ { + if v < h.limit[bits] { r.unreadBits(maxCodeLength - bits) break } @@ -129,12 +127,12 @@ func (h *huffmanDecoder) readSym(r bitReader) (int, error) { dist := v - h.limit[bits-1] dist >>= maxCodeLength - bits - pos := h.pos[bits] + dist + pos := int(h.pos[bits]) + int(dist) if pos >= len(h.symbol) { - return 0, errHuffDecodeFailed + return 0, ErrHuffDecodeFailed } - return h.symbol[pos], nil + return int(h.symbol[pos]), nil } // readCodeLengthTable reads a new code length table into codeLength from br. @@ -194,7 +192,7 @@ func readCodeLengthTable(br bitReader, codeLength []byte, addOld bool) error { } if l < 18 { if i == 0 { - return errInvalidLengthTable + return ErrInvalidLengthTable } value = codeLength[i-1] } diff --git a/vendor/github.com/nwaples/rardecode/ppm_model.go b/vendor/github.com/nwaples/rardecode/v2/ppm_model.go similarity index 98% rename from vendor/github.com/nwaples/rardecode/ppm_model.go rename to vendor/github.com/nwaples/rardecode/v2/ppm_model.go index fd55a74145..b1bddee678 100644 --- a/vendor/github.com/nwaples/rardecode/ppm_model.go +++ b/vendor/github.com/nwaples/rardecode/v2/ppm_model.go @@ -3,6 +3,7 @@ package rardecode import ( "errors" "io" + "math" ) const ( @@ -26,12 +27,11 @@ const ( // A unit can store one context or two states. 
unitSize = 12 - maxUint16 = 1<<16 - 1 - freeMark = -1 + freeMark = -1 ) var ( - errCorruptPPM = errors.New("rardecode: corrupt ppm data") + ErrCorruptPPM = errors.New("rardecode: corrupt ppm data") expEscape = []byte{25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2} initBinEsc = []uint16{0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051} @@ -251,9 +251,7 @@ func (a *subAllocator) restart() { a.heap2Lo = a.heap1Hi / unitSize * 2 a.heap2Hi = int32(len(a.states)) a.glueCount = 0 - for i := range a.freeList { - a.freeList[i] = 0 - } + clear(a.freeList[:]) } // pushByte puts a byte on the heap and returns a state.succ index that @@ -354,7 +352,7 @@ func (a *subAllocator) glueFreeBlocks() { states := a.states[i+u<<1:] for len(states) > 0 && states[0].succ == freeMark { u += int32(states[0].uint16()) - if u > maxUint16 { + if u > math.MaxUint16 { break } states[0].succ = 0 @@ -564,9 +562,7 @@ type model struct { } func (m *model) restart() { - for i := range m.charMask { - m.charMask[i] = 0 - } + clear(m.charMask[:]) m.escCount = 1 if m.maxOrder < 12 { @@ -616,7 +612,7 @@ func (m *model) init(br io.ByteReader, reset bool, maxOrder, maxMB int) error { m.a.init(maxMB) if maxOrder == 1 { - return errCorruptPPM + return ErrCorruptPPM } m.maxOrder = maxOrder m.prevSym = 0 @@ -718,7 +714,7 @@ func (m *model) decodeSymbol1(c context) (*state, error) { // protect against divide by zero // TODO: look at why this happens, may be problem elsewhere if scale == 0 { - return nil, errCorruptPPM + return nil, ErrCorruptPPM } count := m.rc.currentCount(scale) m.prevSuccess = 0 @@ -799,7 +795,7 @@ func (m *model) decodeSymbol2(c context, numMasked int) (*state, error) { count := m.rc.currentCount(scale) if count >= scale { - return nil, errCorruptPPM + return nil, ErrCorruptPPM } if count >= hi { err := m.rc.decode(hi, scale) @@ -900,9 +896,7 @@ func (m *model) update(minC, maxC context, s *state) context { if m.escCount == 0 { m.escCount = 1 - for i := range m.charMask { - m.charMask[i] = 0 - } + clear(m.charMask[:]) } var ss *state // matching minC.suffix state @@ -1044,7 +1038,7 @@ func (m *model) ReadByte() (byte, error) { m.orderFall++ minC = m.a.contextSuffix(minC) if minC <= 0 { - return 0, errCorruptPPM + return 0, ErrCorruptPPM } } s, err = m.decodeSymbol2(minC, n) diff --git a/vendor/github.com/nwaples/rardecode/v2/reader.go b/vendor/github.com/nwaples/rardecode/v2/reader.go new file mode 100644 index 0000000000..e891ce5419 --- /dev/null +++ b/vendor/github.com/nwaples/rardecode/v2/reader.go @@ -0,0 +1,536 @@ +package rardecode + +import ( + "bufio" + "bytes" + "crypto/hmac" + "crypto/sha256" + "errors" + "hash" + "io" + "math" + "os" + "time" +) + +// FileHeader HostOS types +const ( + HostOSUnknown = 0 + HostOSMSDOS = 1 + HostOSOS2 = 2 + HostOSWindows = 3 + HostOSUnix = 4 + HostOSMacOS = 5 + HostOSBeOS = 6 +) + +const ( + maxPassword = int(128) +) + +var ( + ErrShortFile = errors.New("rardecode: decoded file too short") + ErrInvalidFileBlock = errors.New("rardecode: invalid file block") + ErrUnexpectedArcEnd = errors.New("rardecode: unexpected end of archive") + ErrBadFileChecksum = errors.New("rardecode: bad file checksum") + ErrSolidOpen = errors.New("rardecode: solid files don't support Open") + ErrUnknownVersion = errors.New("rardecode: unknown archive version") +) + +// FileHeader represents a single file in a RAR archive. 
+type FileHeader struct { + Name string // file name using '/' as the directory separator + IsDir bool // is a directory + Solid bool // is a solid file + Encrypted bool // file contents are encrypted + HeaderEncrypted bool // file header is encrypted + HostOS byte // Host OS the archive was created on + Attributes int64 // Host OS specific file attributes + PackedSize int64 // packed file size (or first block if the file spans volumes) + UnPackedSize int64 // unpacked file size + UnKnownSize bool // unpacked file size is not known + ModificationTime time.Time // modification time (non-zero if set) + CreationTime time.Time // creation time (non-zero if set) + AccessTime time.Time // access time (non-zero if set) + Version int // file version +} + +// Mode returns an os.FileMode for the file, calculated from the Attributes field. +func (f *FileHeader) Mode() os.FileMode { + var m os.FileMode + + if f.IsDir { + m = os.ModeDir + } + if f.HostOS == HostOSWindows { + if f.IsDir { + m |= 0777 + } else if f.Attributes&1 > 0 { + m |= 0444 // readonly + } else { + m |= 0666 + } + return m + } + // assume unix perms for all remaining os types + m |= os.FileMode(f.Attributes) & os.ModePerm + + // only check other bits on unix host created archives + if f.HostOS != HostOSUnix { + return m + } + + if f.Attributes&0x200 != 0 { + m |= os.ModeSticky + } + if f.Attributes&0x400 != 0 { + m |= os.ModeSetgid + } + if f.Attributes&0x800 != 0 { + m |= os.ModeSetuid + } + + // Check for additional file types. + if f.Attributes&0xF000 == 0xA000 { + m |= os.ModeSymlink + } + return m +} + +type byteReader interface { + io.Reader + bytes() ([]byte, error) +} + +type bufByteReader struct { + buf []byte +} + +func (b *bufByteReader) Read(p []byte) (int, error) { + if len(b.buf) == 0 { + return 0, io.EOF + } + n := copy(p, b.buf) + b.buf = b.buf[n:] + return n, nil +} + +func (b *bufByteReader) bytes() ([]byte, error) { + if len(b.buf) == 0 { + return nil, io.EOF + } + buf := b.buf + b.buf = nil + return buf, nil +} + +func newBufByteReader(buf []byte) *bufByteReader { + return &bufByteReader{buf: buf} +} + +// packedFileReader provides sequential access to packed files in a RAR archive. +type packedFileReader struct { + n int64 // bytes left in current data block + v *volume + r fileBlockReader + h *fileBlockHeader // current file header +} + +// init initializes a cloned packedFileReader +func (f *packedFileReader) init() error { return f.v.init() } + +func (f *packedFileReader) clone() *packedFileReader { + nr := &packedFileReader{n: f.n, h: f.h} + nr.r = f.r.clone() + nr.v = f.v.clone() + return nr +} + +func (f *packedFileReader) Close() error { return f.v.Close() } + +// nextBlock reads the next file block in the current file at the current +// archive file position, or returns an error if there is a problem. +// It is invalid to call this when already at the last block in the current file. +func (f *packedFileReader) nextBlock() error { + if f.h == nil { + return io.EOF + } + // discard current block data + if f.n > 0 { + if err := f.v.discard(f.n); err != nil { + return err + } + f.n = 0 + } + if f.h.last { + return io.EOF + } + h, err := f.r.next(f.v) + if err != nil { + if err == io.EOF { + // archive ended, but file hasn't + return ErrUnexpectedArcEnd + } + return err + } + if h.first || h.Name != f.h.Name { + return ErrInvalidFileBlock + } + f.n = h.PackedSize + f.h = h + return nil +} + +// next advances to the next packed file in the RAR archive. 
+func (f *packedFileReader) next() (*fileBlockHeader, error) { + // skip to last block in current file + var err error + for err == nil { + err = f.nextBlock() + } + if err != io.EOF { + return nil, err + } + f.h, err = f.r.next(f.v) // get next file block + if err != nil { + return nil, err + } + if !f.h.first { + return nil, ErrInvalidFileBlock + } + f.n = f.h.PackedSize + return f.h, nil +} + +// Read reads the packed data for the current file into p. +func (f *packedFileReader) Read(p []byte) (int, error) { + for f.n == 0 { + if err := f.nextBlock(); err != nil { + return 0, err + } + } + if int64(len(p)) > f.n { + p = p[0:f.n] + } + n, err := f.v.Read(p) + f.n -= int64(n) + if err == io.EOF && f.n > 0 { + return n, io.ErrUnexpectedEOF + } + if n > 0 { + return n, nil + } + return n, err +} + +func (f *packedFileReader) bytes() ([]byte, error) { + for f.n == 0 { + if err := f.nextBlock(); err != nil { + return nil, err + } + } + n := int(min(f.n, math.MaxInt)) + if k := f.v.br.Buffered(); k > 0 { + n = min(k, n) + } else { + b, err := f.v.peek(n) + if err != nil && err != bufio.ErrBufferFull { + return nil, err + } + n = len(b) + } + b, err := f.v.readSlice(n) + f.n -= int64(len(b)) + return b, err +} + +func newPackedFileReader(r io.Reader, opts []Option) (*packedFileReader, error) { + v, err := newVolume(r, opts) + if err != nil { + return nil, err + } + fbr, err := newFileBlockReader(v) + if err != nil { + return nil, err + } + return &packedFileReader{r: fbr, v: v}, nil +} + +func openPackedFileReader(name string, opts []Option) (*packedFileReader, error) { + v, err := openVolume(name, opts) + if err != nil { + return nil, err + } + fbr, err := newFileBlockReader(v) + if err != nil { + return nil, err + } + return &packedFileReader{r: fbr, v: v}, nil +} + +type limitedReader struct { + r byteReader + n int64 // bytes remaining + shortErr error // error returned when r returns io.EOF with n > 0 +} + +func (l *limitedReader) Read(p []byte) (int, error) { + if l.n <= 0 { + return 0, io.EOF + } + if int64(len(p)) > l.n { + p = p[0:l.n] + } + n, err := l.r.Read(p) + l.n -= int64(n) + if err == io.EOF && l.n > 0 { + return n, l.shortErr + } + return n, err +} + +func (l *limitedReader) bytes() ([]byte, error) { + b, err := l.r.bytes() + if n := len(b); int64(n) > l.n { + b = b[:int(l.n)] + } + l.n -= int64(len(b)) + return b, err +} + +type checksumReader struct { + r byteReader + hash hash.Hash + pr *packedFileReader +} + +func (cr *checksumReader) eofError() error { + // calculate file checksum + h := cr.pr.h + sum := cr.hash.Sum(nil) + if !h.first && h.genKeys != nil { + if err := h.genKeys(); err != nil { + return err + } + } + if len(h.hashKey) > 0 { + mac := hmac.New(sha256.New, h.hashKey) + _, _ = mac.Write(sum) // ignore error, should always succeed + sum = mac.Sum(sum[:0]) + if len(h.sum) == 4 { + // CRC32 + for i, v := range sum[4:] { + sum[i&3] ^= v + } + sum = sum[:4] + } + } + if !bytes.Equal(sum, h.sum) { + return ErrBadFileChecksum + } + return io.EOF +} + +func (cr *checksumReader) Read(p []byte) (int, error) { + n, err := cr.r.Read(p) + if n > 0 { + if n, err = cr.hash.Write(p[:n]); err != nil { + return n, err + } + } + if err != io.EOF { + return n, err + } + return n, cr.eofError() +} + +func (cr *checksumReader) bytes() ([]byte, error) { + b, err := cr.r.bytes() + if len(b) > 0 { + if _, err = cr.hash.Write(b); err != nil { + return b, err + } + } + if err != io.EOF { + return b, err + } + return b, cr.eofError() +} + +// Reader provides sequential access to files in 
a RAR archive. +type Reader struct { + r byteReader // reader for current unpacked file + dr *decodeReader // reader for decoding and filters if file is compressed + pr *packedFileReader // reader for current raw file bytes +} + +// Read reads from the current file in the RAR archive. +func (r *Reader) Read(p []byte) (int, error) { + if r.r == nil { + err := r.nextFile() + if err != nil { + return 0, err + } + } + return r.r.Read(p) +} + +// WriteTo implements io.WriterTo. +func (r *Reader) WriteTo(w io.Writer) (int64, error) { + if r.r == nil { + err := r.nextFile() + if err != nil { + return 0, err + } + } + var n int64 + b, err := r.r.bytes() + for err == nil { + var nn int + nn, err = w.Write(b) + n += int64(nn) + if err == nil { + b, err = r.r.bytes() + } + } + if err == io.EOF { + err = nil + } + return n, err +} + +// Next advances to the next file in the archive. +func (r *Reader) Next() (*FileHeader, error) { + // check if file is a compressed file in a solid archive + if h := r.pr.h; h != nil && h.decVer > 0 && h.arcSolid { + var err error + if r.r == nil { + // setup full file reader + err = r.nextFile() + } + // decode and discard bytes + for err == nil { + _, err = r.dr.bytes() + } + if err != io.EOF { + return nil, err + } + } + // get next packed file + h, err := r.pr.next() + if err != nil { + return nil, err + } + // Clear the reader as it will be setup on the next Read() or WriteTo(). + r.r = nil + return &h.FileHeader, nil +} + +func (r *Reader) nextFile() error { + h := r.pr.h + if h == nil { + return io.EOF + } + // start with packed file reader + r.r = r.pr + // check for encryption + if h.genKeys != nil { + r.r = newAesDecryptReader(r.pr, h) // decrypt + } + // check for compression + if h.decVer > 0 { + if r.dr == nil { + r.dr = new(decodeReader) + } + err := r.dr.init(r.r, h.decVer, h.winSize, !h.Solid, h.UnPackedSize) + if err != nil { + return err + } + r.r = r.dr + } + if h.UnPackedSize >= 0 && !h.UnKnownSize { + // Limit reading to UnPackedSize as there may be padding + r.r = &limitedReader{r.r, h.UnPackedSize, ErrShortFile} + } + if h.hash != nil { + r.r = &checksumReader{r.r, h.hash(), r.pr} + } + return nil +} + +// NewReader creates a Reader reading from r. +// NewReader only supports single volume archives. +// Multi-volume archives must use OpenReader. +func NewReader(r io.Reader, opts ...Option) (*Reader, error) { + pr, err := newPackedFileReader(r, opts) + if err != nil { + return nil, err + } + return &Reader{pr: pr}, nil +} + +// ReadCloser is a Reader that allows closing of the rar archive. +type ReadCloser struct { + Reader +} + +// Close closes the rar file. +func (rc *ReadCloser) Close() error { + return rc.pr.Close() +} + +// OpenReader opens a RAR archive specified by the name and returns a ReadCloser. +func OpenReader(name string, opts ...Option) (*ReadCloser, error) { + pr, err := openPackedFileReader(name, opts) + if err != nil { + return nil, err + } + return &ReadCloser{Reader{pr: pr}}, nil +} + +// File represents a file in a RAR archive +type File struct { + FileHeader + pr *packedFileReader +} + +// Open returns an io.ReadCloser that provides access to the File's contents. +// Open is not supported on Solid Files as their contents depend on the decoding +// of the preceding files in the archive. Use OpenReader and Next to access Solid file +// contents instead.
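For orientation, a hedged usage sketch of the API assembled above — `OpenReader` (or `NewReader`), `Next`, then reading the `Reader` itself. The archive path and password are placeholders, and `Password` is only needed for encrypted archives:

```go
package main

import (
	"fmt"
	"io"
	"os"

	"github.com/nwaples/rardecode/v2"
)

func main() {
	r, err := rardecode.OpenReader("archive.rar", rardecode.Password("secret"))
	if err != nil {
		panic(err)
	}
	defer r.Close()

	for {
		h, err := r.Next() // advance to the next file header
		if err == io.EOF {
			break
		}
		if err != nil {
			panic(err)
		}
		if h.IsDir {
			continue
		}
		fmt.Println(h.Name, h.UnPackedSize)
		// r reads the current file's decoded contents; copy to a real
		// destination file in practice.
		if _, err = io.Copy(os.Stdout, r); err != nil {
			panic(err)
		}
	}
}
```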
+func (f *File) Open() (io.ReadCloser, error) { + if f.Solid { + return nil, ErrSolidOpen + } + r := new(ReadCloser) + r.pr = f.pr.clone() + return r, r.pr.init() +} + +// List returns a list of Files in the RAR archive specified by name. +func List(name string, opts ...Option) ([]*File, error) { + r, err := OpenReader(name, opts...) + if err != nil { + return nil, err + } + pr := r.pr + defer pr.Close() + + var fl []*File + for { + // get next file + h, err := pr.next() + if err != nil { + if err == io.EOF { + return fl, nil + } + return nil, err + } + + // save information for File + f := new(File) + f.FileHeader = h.FileHeader + f.pr = pr.clone() + fl = append(fl, f) + } +} diff --git a/vendor/github.com/nwaples/rardecode/vm.go b/vendor/github.com/nwaples/rardecode/v2/vm.go similarity index 97% rename from vendor/github.com/nwaples/rardecode/vm.go rename to vendor/github.com/nwaples/rardecode/v2/vm.go index fd26a5a0ae..f6844207de 100644 --- a/vendor/github.com/nwaples/rardecode/vm.go +++ b/vendor/github.com/nwaples/rardecode/v2/vm.go @@ -19,7 +19,7 @@ const ( ) var ( - errInvalidVMInstruction = errors.New("rardecode: invalid vm instruction") + ErrInvalidVMInstruction = errors.New("rardecode: invalid vm instruction") ) type vm struct { @@ -63,8 +63,8 @@ func newVM(mem []byte) *vm { copy(v.m, mem) } else { v.m = mem[:vmSize+4] - for i := len(mem); i < len(v.m); i++ { - v.m[i] = 0 + if l := len(mem); l < len(v.m) { + clear(v.m[l:]) } } v.r[7] = vmSize @@ -585,7 +585,8 @@ func decodeArg(br *rarBitReader, byteMode bool) (operand, error) { if byteMode { n, err = br.readBits(8) } else { - m, err := br.readUint32() + var m uint32 + m, err = br.readUint32() return opI(m), err } return opI(n), err @@ -609,7 +610,8 @@ func decodeArg(br *rarBitReader, byteMode bool) (operand, error) { if err != nil { return nil, err } - i, err := br.readUint32() + var i uint32 + i, err = br.readUint32() return opBI{r: uint32(n), i: i}, err } // Direct addressing @@ -644,7 +646,8 @@ func readCommands(br *rarBitReader) ([]command, error) { return cmds, err } if code&0x08 > 0 { - n, err := br.readBits(2) + var n int + n, err = br.readBits(2) if err != nil { return cmds, err } @@ -652,14 +655,15 @@ func readCommands(br *rarBitReader) ([]command, error) { } if code >= len(ops) { - return cmds, errInvalidVMInstruction + return cmds, ErrInvalidVMInstruction } ins := ops[code] var com command if ins.byteMode { - n, err := br.readBits(1) + var n int + n, err = br.readBits(1) if err != nil { return cmds, err } diff --git a/vendor/github.com/nwaples/rardecode/v2/volume.go b/vendor/github.com/nwaples/rardecode/v2/volume.go new file mode 100644 index 0000000000..8d2d665198 --- /dev/null +++ b/vendor/github.com/nwaples/rardecode/v2/volume.go @@ -0,0 +1,408 @@ +package rardecode + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io" + "io/fs" + "math" + "os" + "path/filepath" + "strconv" + "strings" +) + +const ( + maxSfxSize = 0x100000 // maximum number of bytes to read when searching for RAR signature + sigPrefix = "Rar!\x1A\x07" +) + +var ( + ErrNoSig = errors.New("rardecode: RAR signature not found") + ErrVerMismatch = errors.New("rardecode: volume version mismatch") + ErrArchiveNameEmpty = errors.New("rardecode: archive name empty") + ErrFileNameRequired = errors.New("rardecode: filename required for multi volume archive") +) + +type option struct { + bsize int // size to be used for bufio.Reader + fs fs.FS // filesystem to use to open files
+    pass  *string // password for encrypted volumes
+}
+
+// An Option is used for optional archive extraction settings.
+type Option func(*option)
+
+// BufferSize sets the size of the bufio.Reader used in reading the archive.
+func BufferSize(size int) Option {
+    return func(o *option) { o.bsize = size }
+}
+
+// FileSystem sets the fs.FS to be used for opening archive volumes.
+func FileSystem(fs fs.FS) Option {
+    return func(o *option) { o.fs = fs }
+}
+
+// Password sets the password to use for decrypting archives.
+func Password(pass string) Option {
+    return func(o *option) { o.pass = &pass }
+}
+
+// volume extends a fileBlockReader to be used across multiple
+// files in a multi-volume archive.
+type volume struct {
+    f    io.Reader     // current file handle
+    br   *bufio.Reader // buffered reader for current volume file
+    dir  string        // current volume directory path
+    file string        // current volume file name
+    num  int           // volume number
+    old  bool          // uses old naming scheme
+    off  int64         // current file offset
+    ver  int           // archive file format version
+    opt  option        // optional settings
+}
+
+func (v *volume) setOpts(opts []Option) {
+    for _, f := range opts {
+        f(&v.opt)
+    }
+}
+
+func (v *volume) setBuffer() {
+    if v.br != nil {
+        v.br.Reset(v.f)
+    } else if size := v.opt.bsize; size > 0 {
+        v.br = bufio.NewReaderSize(v.f, size)
+    } else if br, ok := v.f.(*bufio.Reader); ok {
+        v.br = br
+    } else {
+        v.br = bufio.NewReader(v.f)
+    }
+}
+
+func (v *volume) openFile(file string) error {
+    var err error
+    var f io.Reader
+
+    if len(file) == 0 {
+        return ErrArchiveNameEmpty
+    }
+    if fs := v.opt.fs; fs != nil {
+        f, err = fs.Open(v.dir + file)
+    } else {
+        f, err = os.Open(v.dir + file)
+    }
+    if err != nil {
+        return err
+    }
+    v.f = f
+    v.file = file
+    v.setBuffer()
+    return nil
+}
+
+func (v *volume) init() error {
+    err := v.openFile(v.file)
+    if err != nil {
+        return err
+    }
+    err = v.discard(v.off)
+    if err != nil {
+        _ = v.Close()
+    }
+    return err
+}
+
+func (v *volume) clone() *volume {
+    nv := new(volume)
+    *nv = *v
+    nv.f = nil
+    nv.br = nil
+    return nv
+}
+
+func (v *volume) Close() error {
+    // v.f may be nil if os.Open fails in next().
+    // We only close if we opened it (i.e. v.file provided).
+    if v.f != nil && len(v.file) > 0 {
+        if c, ok := v.f.(io.Closer); ok {
+            err := c.Close()
+            v.f = nil // set to nil so we can only close v.f once
+            return err
+        }
+    }
+    return nil
+}
+
+func (v *volume) discard(n int64) error {
+    var err error
+    v.off += n
+    l := int64(v.br.Buffered())
+    if n <= l {
+        _, err = v.br.Discard(int(n))
+    } else if sr, ok := v.f.(io.Seeker); ok {
+        n -= l
+        _, err = sr.Seek(n, io.SeekCurrent)
+        v.br.Reset(v.f)
+    } else {
+        for n > math.MaxInt && err == nil {
+            _, err = v.br.Discard(math.MaxInt)
+            n -= math.MaxInt
+        }
+        if err == nil && n > 0 {
+            _, err = v.br.Discard(int(n))
+        }
+    }
+    if err == io.EOF {
+        err = io.ErrUnexpectedEOF
+    }
+    return err
+}
+
+func (v *volume) peek(n int) ([]byte, error) {
+    b, err := v.br.Peek(n)
+    if err == io.EOF && len(b) > 0 {
+        err = io.ErrUnexpectedEOF
+    }
+    return b, err
+}
+
+func (v *volume) readSlice(n int) ([]byte, error) {
+    b, err := v.br.Peek(n)
+    if err == nil {
+        n, err = v.br.Discard(n)
+        v.off += int64(n)
+        return b[:n:n], err
+    }
+    if err != bufio.ErrBufferFull {
+        if err == io.EOF && len(b) > 0 {
+            err = io.ErrUnexpectedEOF
+        }
+        return nil, err
+    }
+    // bufio.Reader buffer is too small, create a new slice and copy to it
+    b = make([]byte, n)
+    if _, err = io.ReadFull(v.br, b); err != nil {
+        return nil, err
+    }
+    v.off += int64(n)
+    return b, nil
+}
+
+func (v *volume) Read(p []byte) (int, error) {
+    n, err := v.br.Read(p)
+    v.off += int64(n)
+    return n, err
+}
+
+// findSig searches for the RAR signature and version at the beginning of a file.
+// It searches no more than maxSfxSize bytes.
+func (v *volume) findSig() error {
+    v.off = 0
+    for v.off <= maxSfxSize {
+        b, err := v.br.ReadSlice(sigPrefix[0])
+        v.off += int64(len(b))
+        if err == bufio.ErrBufferFull {
+            continue
+        } else if err != nil {
+            if err == io.EOF {
+                err = ErrNoSig
+            }
+            return err
+        }
+
+        b, err = v.br.Peek(len(sigPrefix[1:]) + 2)
+        if err != nil {
+            if err == io.EOF {
+                err = ErrNoSig
+            }
+            return err
+        }
+        if !bytes.HasPrefix(b, []byte(sigPrefix[1:])) {
+            continue
+        }
+        b = b[len(sigPrefix)-1:]
+
+        ver := int(b[0])
+        if b[0] != 0 && b[1] != 0 {
+            continue
+        }
+        b, err = v.br.ReadSlice('\x00')
+        v.off += int64(len(b))
+        if v.num == 0 {
+            v.ver = ver
+        } else if v.ver != ver {
+            return ErrVerMismatch
+        }
+        return err
+    }
+    return ErrNoSig
+}
+
+func nextNewVolName(file string) string {
+    var inDigit bool
+    var m []int
+    for i, c := range file {
+        if c >= '0' && c <= '9' {
+            if !inDigit {
+                m = append(m, i)
+                inDigit = true
+            }
+        } else if inDigit {
+            m = append(m, i)
+            inDigit = false
+        }
+    }
+    if inDigit {
+        m = append(m, len(file))
+    }
+    if l := len(m); l >= 4 {
+        // More than 1 match so assume name.part###of###.rar style.
+        // Take the last 2 matches where the first is the volume number.
+        m = m[l-4 : l]
+        if strings.Contains(file[m[1]:m[2]], ".") || !strings.Contains(file[:m[0]], ".") {
+            // Didn't match above style as volume had '.' between the two numbers or didn't have a '.'
+            // before the first match. Use the second number as volume number.
+            m = m[2:]
+        }
+    }
+    // extract and increment volume number
+    lo, hi := m[0], m[1]
+    n, err := strconv.Atoi(file[lo:hi])
+    if err != nil {
+        n = 0
+    } else {
+        n++
+    }
+    // volume number must use at least the same number of characters as previous volume
+    vol := fmt.Sprintf("%0"+fmt.Sprint(hi-lo)+"d", n)
+    return file[:lo] + vol + file[hi:]
+}
+
+func nextOldVolName(file string) string {
+    // old style volume naming
+    i := strings.LastIndex(file, ".")
+    // get file extension
+    b := []byte(file[i+1:])
+
+    // If the 2nd and 3rd characters of the file extension are not digits,
+    // replace them with "00" and ignore any trailing characters.
+    if len(b) < 3 || b[1] < '0' || b[1] > '9' || b[2] < '0' || b[2] > '9' {
+        return file[:i+2] + "00"
+    }
+
+    // start incrementing volume number digits from rightmost
+    for j := 2; j >= 0; j-- {
+        if b[j] != '9' {
+            b[j]++
+            break
+        }
+        // digit overflow
+        if j == 0 {
+            // last character before '.'
+            b[j] = 'A'
+        } else {
+            // set to '0' and loop to next character
+            b[j] = '0'
+        }
+    }
+    return file[:i+1] + string(b)
+}
+
+func hasDigits(s string) bool {
+    for _, c := range s {
+        if c >= '0' && c <= '9' {
+            return true
+        }
+    }
+    return false
+}
+
+// openNextFile opens the next volume file in the archive.
+func (v *volume) openNextFile() error {
+    file := v.file
+    if v.num == 0 {
+        // check file extensions
+        i := strings.LastIndex(file, ".")
+        if i < 0 {
+            // no file extension, add one
+            file += ".rar"
+        } else {
+            ext := strings.ToLower(file[i+1:])
+            // replace with .rar for empty extensions & self extracting archives
+            if ext == "" || ext == "exe" || ext == "sfx" {
+                file = file[:i+1] + "rar"
+            }
+        }
+        // new naming scheme must have volume number in filename
+        if !v.old {
+            if hasDigits(file) {
+                // found digits, try using new naming scheme
+                err := v.openFile(nextNewVolName(file))
+                if err != nil && os.IsNotExist(err) {
+                    // file didn't exist, try old naming scheme
+                    oldErr := v.openFile(nextOldVolName(file))
+                    if oldErr == nil || !os.IsNotExist(err) {
+                        v.old = true
+                        return oldErr
+                    }
+                }
+                return err
+            }
+            // no digits in filename, use old naming
+            v.old = true
+        }
+    }
+    if v.old {
+        file = nextOldVolName(file)
+    } else {
+        file = nextNewVolName(file)
+    }
+    return v.openFile(file)
+}
+
+func (v *volume) next() error {
+    if len(v.file) == 0 {
+        return ErrFileNameRequired
+    }
+    err := v.Close()
+    if err != nil {
+        return err
+    }
+    v.f = nil
+    err = v.openNextFile() // Open next volume file
+    v.num++
+    if err != nil {
+        return err
+    }
+    err = v.findSig()
+    if err != nil {
+        _ = v.Close()
+    }
+    return err
+}
+
+func newVolume(r io.Reader, opts []Option) (*volume, error) {
+    v := &volume{f: r}
+    v.setOpts(opts)
+    v.setBuffer()
+    return v, v.findSig()
+}
+
+func openVolume(name string, opts []Option) (*volume, error) {
+    v := &volume{}
+    v.dir, v.file = filepath.Split(name)
+    v.setOpts(opts)
+    err := v.openFile(v.file)
+    if err != nil {
+        return nil, err
+    }
+    err = v.findSig()
+    if err != nil {
+        _ = v.Close()
+        return nil, err
+    }
+    return v, nil
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/README.md b/vendor/github.com/pierrec/lz4/v4/README.md
index df027e2c30..dee77545b0 100644
--- a/vendor/github.com/pierrec/lz4/v4/README.md
+++ b/vendor/github.com/pierrec/lz4/v4/README.md
@@ -21,7 +21,7 @@ go get github.com/pierrec/lz4/v4
 
 There is a command line interface tool to compress and decompress LZ4 files.
 ```
-go install github.com/pierrec/lz4/v4/cmd/lz4c
+go install github.com/pierrec/lz4/v4/cmd/lz4c@latest
 ```
 
 Usage
@@ -87,4 +87,6 @@ Thanks to all [contributors](https://github.com/pierrec/lz4/graphs/contributors)
 
 Special thanks to [@Zariel](https://github.com/Zariel) for his asm implementation of the decoder.
 
+Special thanks to [@greatroar](https://github.com/greatroar) for his work on the asm implementations of the decoder for amd64 and arm64.
+
 Special thanks to [@klauspost](https://github.com/klauspost) for his work on optimizing the code.
diff --git a/vendor/github.com/pierrec/lz4/v4/compressing_reader.go b/vendor/github.com/pierrec/lz4/v4/compressing_reader.go
new file mode 100644
index 0000000000..8df0dc76d0
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/compressing_reader.go
@@ -0,0 +1,222 @@
+package lz4
+
+import (
+    "errors"
+    "io"
+
+    "github.com/pierrec/lz4/v4/internal/lz4block"
+    "github.com/pierrec/lz4/v4/internal/lz4errors"
+    "github.com/pierrec/lz4/v4/internal/lz4stream"
+)
+
+type crState int
+
+const (
+    crStateInitial crState = iota
+    crStateReading
+    crStateFlushing
+    crStateDone
+)
+
+type CompressingReader struct {
+    state   crState
+    src     io.ReadCloser             // source reader
+    level   lz4block.CompressionLevel // how hard to try
+    frame   *lz4stream.Frame          // frame being built
+    in      []byte
+    out     ovWriter
+    handler func(int)
+}
+
+// NewCompressingReader creates a reader which reads compressed data from
+// a raw stream. This makes it the logical opposite of a normal lz4.Reader.
+// We require an io.ReadCloser as an underlying source for compatibility
+// with Go's http.Request.
+func NewCompressingReader(src io.ReadCloser) *CompressingReader {
+    zrd := &CompressingReader{
+        frame: lz4stream.NewFrame(),
+    }
+
+    _ = zrd.Apply(DefaultBlockSizeOption, DefaultChecksumOption, defaultOnBlockDone)
+    zrd.Reset(src)
+
+    return zrd
+}
+
+// Source exposes the underlying source stream for introspection and control.
+func (zrd *CompressingReader) Source() io.ReadCloser {
+    return zrd.src
+}
+
+// Close simply invokes the underlying stream Close method. This method is
+// provided for the benefit of the Go http client/server, which relies on Close
+// for goroutine termination.
+func (zrd *CompressingReader) Close() error {
+    return zrd.src.Close()
+}
+
+// Apply applies useful options to the lz4 encoder.
+func (zrd *CompressingReader) Apply(options ...Option) (err error) { + if zrd.state != crStateInitial { + return lz4errors.ErrOptionClosedOrError + } + + zrd.Reset(zrd.src) + + for _, o := range options { + if err = o(zrd); err != nil { + return + } + } + return +} + +func (*CompressingReader) private() {} + +func (zrd *CompressingReader) init() error { + zrd.frame.InitW(&zrd.out, 1, false) + size := zrd.frame.Descriptor.Flags.BlockSizeIndex() + zrd.in = size.Get() + return zrd.frame.Descriptor.Write(zrd.frame, &zrd.out) +} + +// Read allows reading of lz4 compressed data +func (zrd *CompressingReader) Read(p []byte) (n int, err error) { + defer func() { + if err != nil { + zrd.state = crStateDone + } + }() + + if !zrd.out.reset(p) { + return len(p), nil + } + + switch zrd.state { + case crStateInitial: + err = zrd.init() + if err != nil { + return + } + zrd.state = crStateReading + case crStateDone: + return 0, errors.New("This reader is done") + case crStateFlushing: + if zrd.out.dataPos > 0 { + n = zrd.out.dataPos + zrd.out.data = nil + zrd.out.dataPos = 0 + return + } else { + zrd.state = crStateDone + return 0, io.EOF + } + } + + for zrd.state == crStateReading { + block := zrd.frame.Blocks.Block + + var rCount int + rCount, err = io.ReadFull(zrd.src, zrd.in) + switch err { + case nil: + err = block.Compress( + zrd.frame, zrd.in[ : rCount], zrd.level, + ).Write(zrd.frame, &zrd.out) + zrd.handler(len(block.Data)) + if err != nil { + return + } + + if zrd.out.dataPos == len(zrd.out.data) { + n = zrd.out.dataPos + zrd.out.dataPos = 0 + zrd.out.data = nil + return + } + case io.EOF, io.ErrUnexpectedEOF: // read may be partial + if rCount > 0 { + err = block.Compress( + zrd.frame, zrd.in[ : rCount], zrd.level, + ).Write(zrd.frame, &zrd.out) + zrd.handler(len(block.Data)) + if err != nil { + return + } + } + + err = zrd.frame.CloseW(&zrd.out, 1) + if err != nil { + return + } + zrd.state = crStateFlushing + + n = zrd.out.dataPos + zrd.out.dataPos = 0 + zrd.out.data = nil + return + default: + return + } + } + + err = lz4errors.ErrInternalUnhandledState + return +} + +// Reset makes the stream usable again; mostly handy to reuse lz4 encoder +// instances. +func (zrd *CompressingReader) Reset(src io.ReadCloser) { + zrd.frame.Reset(1) + zrd.state = crStateInitial + zrd.src = src + zrd.out.clear() +} + +type ovWriter struct { + data []byte + ov []byte + dataPos int + ovPos int +} + +func (wr *ovWriter) Write(p []byte) (n int, err error) { + count := copy(wr.data[wr.dataPos : ], p) + wr.dataPos += count + + if count < len(p) { + wr.ov = append(wr.ov, p[count : ]...) 
+ } + + return len(p), nil +} + +func (wr *ovWriter) reset(out []byte) bool { + ovRem := len(wr.ov) - wr.ovPos + + if ovRem >= len(out) { + wr.ovPos += copy(out, wr.ov[wr.ovPos : ]) + return false + } + + if ovRem > 0 { + copy(out, wr.ov[wr.ovPos : ]) + wr.ov = wr.ov[ : 0] + wr.ovPos = 0 + wr.dataPos = ovRem + } else if wr.ovPos > 0 { + wr.ov = wr.ov[ : 0] + wr.ovPos = 0 + wr.dataPos = 0 + } + + wr.data = out + return true +} + +func (wr *ovWriter) clear() { + wr.data = nil + wr.dataPos = 0 + wr.ov = wr.ov[ : 0] + wr.ovPos = 0 +} diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/block.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/block.go index f382649430..fec8adb03a 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/block.go +++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/block.go @@ -41,11 +41,11 @@ func CompressBlockBound(n int) int { return n + n/255 + 16 } -func UncompressBlock(src, dst []byte) (int, error) { +func UncompressBlock(src, dst, dict []byte) (int, error) { if len(src) == 0 { return 0, nil } - if di := decodeBlock(dst, src); di >= 0 { + if di := decodeBlock(dst, src, dict); di >= 0 { return di, nil } return 0, lz4errors.ErrInvalidSourceShortBuffer @@ -62,7 +62,10 @@ type Compressor struct { // inspecting the input stream. table [htSize]uint16 - needsReset bool + // Bitmap indicating which positions in the table are in use. + // This allows us to quickly reset the table for reuse, + // without having to zero everything. + inUse [htSize / 32]uint32 } // Get returns the position of a presumptive match for the hash h. @@ -70,7 +73,10 @@ type Compressor struct { // If si < winSize, the return value may be negative. func (c *Compressor) get(h uint32, si int) int { h &= htSize - 1 - i := int(c.table[h]) + i := 0 + if c.inUse[h/32]&(1<<(h%32)) != 0 { + i = int(c.table[h]) + } i += si &^ winMask if i >= si { // Try previous 64kiB block (negative when in first block). @@ -82,8 +88,11 @@ func (c *Compressor) get(h uint32, si int) int { func (c *Compressor) put(h uint32, si int) { h &= htSize - 1 c.table[h] = uint16(si) + c.inUse[h/32] |= 1 << (h % 32) } +func (c *Compressor) reset() { c.inUse = [htSize / 32]uint32{} } + var compressorPool = sync.Pool{New: func() interface{} { return new(Compressor) }} func CompressBlock(src, dst []byte) (int, error) { @@ -94,11 +103,8 @@ func CompressBlock(src, dst []byte) (int, error) { } func (c *Compressor) CompressBlock(src, dst []byte) (int, error) { - if c.needsReset { - // Zero out reused table to avoid non-deterministic output (issue #65). - c.table = [htSize]uint16{} - } - c.needsReset = true // Only false on first call. + // Zero out reused table to avoid non-deterministic output (issue #65). + c.reset() // Return 0, nil only if the destination buffer size is < CompressBlockBound. isNotCompressible := len(dst) < CompressBlockBound(len(src)) @@ -126,7 +132,7 @@ func (c *Compressor) CompressBlock(src, dst []byte) (int, error) { // We check a match at s, s+1 and s+2 and pick the first one we get. // Checking 3 only requires us to load the source one. ref := c.get(h, si) - ref2 := c.get(h2, si) + ref2 := c.get(h2, si+1) c.put(h, si) c.put(h2, si+1) @@ -175,7 +181,7 @@ func (c *Compressor) CompressBlock(src, dst []byte) (int, error) { si, mLen = si+mLen, si+minMatch // Find the longest match by looking by batches of 8 bytes. 
- for si+8 < sn { + for si+8 <= sn { x := binary.LittleEndian.Uint64(src[si:]) ^ binary.LittleEndian.Uint64(src[si-offset:]) if x == 0 { si += 8 @@ -187,6 +193,9 @@ func (c *Compressor) CompressBlock(src, dst []byte) (int, error) { } mLen = si - mLen + if di >= len(dst) { + return 0, lz4errors.ErrInvalidSourceShortBuffer + } if mLen < 0xF { dst[di] = byte(mLen) } else { @@ -200,10 +209,13 @@ func (c *Compressor) CompressBlock(src, dst []byte) (int, error) { dst[di] |= 0xF0 di++ l := lLen - 0xF - for ; l >= 0xFF; l -= 0xFF { + for ; l >= 0xFF && di < len(dst); l -= 0xFF { dst[di] = 0xFF di++ } + if di >= len(dst) { + return 0, lz4errors.ErrInvalidSourceShortBuffer + } dst[di] = byte(l) } di++ diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/blocks.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/blocks.go index e6cf88d71c..138083d947 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/blocks.go +++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/blocks.go @@ -8,8 +8,7 @@ const ( Block256Kb Block1Mb Block4Mb - Block8Mb = 2 * Block4Mb - legacyBlockSize = Block8Mb + Block8Mb/255 + 16 // CompressBound(Block8Mb) + Block8Mb = 2 * Block4Mb ) var ( @@ -17,7 +16,7 @@ var ( BlockPool256K = sync.Pool{New: func() interface{} { return make([]byte, Block256Kb) }} BlockPool1M = sync.Pool{New: func() interface{} { return make([]byte, Block1Mb) }} BlockPool4M = sync.Pool{New: func() interface{} { return make([]byte, Block4Mb) }} - BlockPool8M = sync.Pool{New: func() interface{} { return make([]byte, legacyBlockSize) }} + BlockPool8M = sync.Pool{New: func() interface{} { return make([]byte, Block8Mb) }} ) func Index(b uint32) BlockSizeIndex { @@ -78,7 +77,7 @@ func Put(buf []byte) { BlockPool1M.Put(buf[:c]) case Block4Mb: BlockPool4M.Put(buf[:c]) - case legacyBlockSize: + case Block8Mb: BlockPool8M.Put(buf[:c]) } } diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_amd64.s b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_amd64.s index be79faa3fe..1d00133fac 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_amd64.s +++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_amd64.s @@ -2,12 +2,13 @@ // +build gc // +build !noasm +#include "go_asm.h" #include "textflag.h" // AX scratch // BX scratch -// CX scratch -// DX token +// CX literal and match lengths +// DX token, match offset // // DI &dst // SI &src @@ -16,9 +17,11 @@ // R11 &dst // R12 short output end // R13 short input end -// func decodeBlock(dst, src []byte) int -// using 50 bytes of stack currently -TEXT ·decodeBlock(SB), NOSPLIT, $64-56 +// R14 &dict +// R15 len(dict) + +// func decodeBlock(dst, src, dict []byte) int +TEXT ·decodeBlock(SB), NOSPLIT, $48-80 MOVQ dst_base+0(FP), DI MOVQ DI, R11 MOVQ dst_len+8(FP), R8 @@ -30,6 +33,9 @@ TEXT ·decodeBlock(SB), NOSPLIT, $64-56 JE err_corrupt ADDQ SI, R9 + MOVQ dict_base+48(FP), R14 + MOVQ dict_len+56(FP), R15 + // shortcut ends // short output end MOVQ R8, R12 @@ -38,28 +44,26 @@ TEXT ·decodeBlock(SB), NOSPLIT, $64-56 MOVQ R9, R13 SUBQ $16, R13 -loop: - // for si < len(src) - CMPQ SI, R9 - JGE end + XORL CX, CX +loop: // token := uint32(src[si]) - MOVBQZX (SI), DX + MOVBLZX (SI), DX INCQ SI // lit_len = token >> 4 // if lit_len > 0 // CX = lit_len - MOVQ DX, CX - SHRQ $4, CX + MOVL DX, CX + SHRL $4, CX // if lit_len != 0xF - CMPQ CX, $0xF - JEQ lit_len_loop_pre + CMPL CX, $0xF + JEQ lit_len_loop CMPQ DI, R12 - JGE lit_len_loop_pre + 
JAE copy_literal CMPQ SI, R13 - JGE lit_len_loop_pre + JAE copy_literal // copy shortcut @@ -78,28 +82,32 @@ loop: ADDQ CX, DI ADDQ CX, SI - MOVQ DX, CX - ANDQ $0xF, CX + MOVL DX, CX + ANDL $0xF, CX // The second stage: prepare for match copying, decode full info. // If it doesn't work out, the info won't be wasted. // offset := uint16(data[:2]) - MOVWQZX (SI), DX + MOVWLZX (SI), DX + TESTL DX, DX + JE err_corrupt ADDQ $2, SI + JC err_short_buf MOVQ DI, AX SUBQ DX, AX + JC err_corrupt CMPQ AX, DI - JGT err_short_buf + JA err_short_buf // if we can't do the second stage then jump straight to read the // match length, we already have the offset. - CMPQ CX, $0xF + CMPL CX, $0xF JEQ match_len_loop_pre - CMPQ DX, $8 + CMPL DX, $8 JLT match_len_loop_pre CMPQ AX, R11 - JLT err_short_buf + JB match_len_loop_pre // memcpy(op + 0, match + 0, 8); MOVQ (AX), BX @@ -111,68 +119,63 @@ loop: MOVW 16(AX), BX MOVW BX, 16(DI) - LEAQ 4(DI)(CX*1), DI // minmatch + LEAQ const_minMatch(DI)(CX*1), DI // shortcut complete, load next token - JMP loop - -lit_len_loop_pre: - // if lit_len > 0 - CMPQ CX, $0 - JEQ offset - CMPQ CX, $0xF - JNE copy_literal + JMP loopcheck + // Read the rest of the literal length: + // do { BX = src[si++]; lit_len += BX } while (BX == 0xFF). lit_len_loop: - // for src[si] == 0xFF - CMPB (SI), $0xFF - JNE lit_len_finalise - - // bounds check src[si+1] - LEAQ 1(SI), AX - CMPQ AX, R9 - JGT err_short_buf + CMPQ SI, R9 + JAE err_short_buf - // lit_len += 0xFF - ADDQ $0xFF, CX + MOVBLZX (SI), BX INCQ SI - JMP lit_len_loop + ADDQ BX, CX -lit_len_finalise: - // lit_len += int(src[si]) - // si++ - MOVBQZX (SI), AX - ADDQ AX, CX - INCQ SI + CMPB BX, $0xFF + JE lit_len_loop copy_literal: // bounds check src and dst - LEAQ (SI)(CX*1), AX + MOVQ SI, AX + ADDQ CX, AX + JC err_short_buf CMPQ AX, R9 - JGT err_short_buf + JA err_short_buf - LEAQ (DI)(CX*1), AX - CMPQ AX, R8 - JGT err_short_buf + MOVQ DI, BX + ADDQ CX, BX + JC err_short_buf + CMPQ BX, R8 + JA err_short_buf - // whats a good cut off to call memmove? - CMPQ CX, $16 + // Copy literals of <=48 bytes through the XMM registers. + CMPQ CX, $48 JGT memmove_lit - // if len(dst[di:]) < 16 + // if len(dst[di:]) < 48 MOVQ R8, AX SUBQ DI, AX - CMPQ AX, $16 + CMPQ AX, $48 JLT memmove_lit - // if len(src[si:]) < 16 - MOVQ R9, AX - SUBQ SI, AX - CMPQ AX, $16 + // if len(src[si:]) < 48 + MOVQ R9, BX + SUBQ SI, BX + CMPQ BX, $48 JLT memmove_lit MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + + ADDQ CX, SI + ADDQ CX, DI JMP finish_lit_copy @@ -181,18 +184,20 @@ memmove_lit: MOVQ DI, 0(SP) MOVQ SI, 8(SP) MOVQ CX, 16(SP) - // spill + + // Spill registers. Increment SI, DI now so we don't need to save CX. 
+ ADDQ CX, DI + ADDQ CX, SI MOVQ DI, 24(SP) MOVQ SI, 32(SP) - MOVQ CX, 40(SP) // need len to inc SI, DI after - MOVB DX, 48(SP) + MOVL DX, 40(SP) + CALL runtime·memmove(SB) // restore registers MOVQ 24(SP), DI MOVQ 32(SP), SI - MOVQ 40(SP), CX - MOVB 48(SP), DX + MOVL 40(SP), DX // recalc initial values MOVQ dst_base+0(FP), R8 @@ -200,74 +205,62 @@ memmove_lit: ADDQ dst_len+8(FP), R8 MOVQ src_base+24(FP), R9 ADDQ src_len+32(FP), R9 + MOVQ dict_base+48(FP), R14 + MOVQ dict_len+56(FP), R15 MOVQ R8, R12 SUBQ $32, R12 MOVQ R9, R13 SUBQ $16, R13 finish_lit_copy: - ADDQ CX, SI - ADDQ CX, DI - - CMPQ SI, R9 - JGE end - -offset: // CX := mLen // free up DX to use for offset - MOVQ DX, CX + MOVL DX, CX + ANDL $0xF, CX - LEAQ 2(SI), AX - CMPQ AX, R9 - JGT err_short_buf + CMPQ SI, R9 + JAE end // offset - // DX := int(src[si]) | int(src[si+1])<<8 - MOVWQZX (SI), DX + // si += 2 + // DX := int(src[si-2]) | int(src[si-1])<<8 ADDQ $2, SI + JC err_short_buf + CMPQ SI, R9 + JA err_short_buf + MOVWQZX -2(SI), DX // 0 offset is invalid - CMPQ DX, $0 - JEQ err_corrupt - - ANDB $0xF, CX + TESTL DX, DX + JEQ err_corrupt match_len_loop_pre: // if mlen != 0xF CMPB CX, $0xF JNE copy_match + // do { BX = src[si++]; mlen += BX } while (BX == 0xFF). match_len_loop: - // for src[si] == 0xFF - // lit_len += 0xFF - CMPB (SI), $0xFF - JNE match_len_finalise - - // bounds check src[si+1] - LEAQ 1(SI), AX - CMPQ AX, R9 - JGT err_short_buf + CMPQ SI, R9 + JAE err_short_buf - ADDQ $0xFF, CX + MOVBLZX (SI), BX INCQ SI - JMP match_len_loop + ADDQ BX, CX -match_len_finalise: - // lit_len += int(src[si]) - // si++ - MOVBQZX (SI), AX - ADDQ AX, CX - INCQ SI + CMPB BX, $0xFF + JE match_len_loop copy_match: - // mLen += minMatch - ADDQ $4, CX + ADDQ $const_minMatch, CX // check we have match_len bytes left in dst // di+match_len < len(dst) - LEAQ (DI)(CX*1), AX + MOVQ DI, AX + ADDQ CX, AX + JC err_short_buf CMPQ AX, R8 - JGT err_short_buf + JA err_short_buf // DX = offset // CX = match_len @@ -277,13 +270,14 @@ copy_match: // check BX is within dst // if BX < &dst + JC copy_match_from_dict CMPQ BX, R11 - JLT err_short_buf + JBE copy_match_from_dict // if offset + match_len < di LEAQ (BX)(CX*1), AX CMPQ DI, AX - JGT copy_interior_match + JA copy_interior_match // AX := len(dst[:di]) // MOVQ DI, AX @@ -303,11 +297,9 @@ copy_match_loop: INCQ DI INCQ BX DECQ CX + JNZ copy_match_loop - CMPQ CX, $0 - JGT copy_match_loop - - JMP loop + JMP loopcheck copy_interior_match: CMPQ CX, $16 @@ -323,23 +315,50 @@ copy_interior_match: MOVOU X0, (DI) ADDQ CX, DI - JMP loop + XORL CX, CX + JMP loopcheck + +copy_match_from_dict: + // CX = match_len + // BX = &dst + (di - offset) + + // AX = offset - di = dict_bytes_available => count of bytes potentially covered by the dictionary + MOVQ R11, AX + SUBQ BX, AX + + // BX = len(dict) - dict_bytes_available + MOVQ R15, BX + SUBQ AX, BX + JS err_short_dict + + ADDQ R14, BX + + // if match_len > dict_bytes_available, match fits entirely within external dictionary : just copy + CMPQ CX, AX + JLT memmove_match + + // The match stretches over the dictionary and our block + // 1) copy what comes from the dictionary + // AX = dict_bytes_available = copy_size + // BX = &dict_end - copy_size + // CX = match_len -memmove_match: // memmove(to, from, len) MOVQ DI, 0(SP) MOVQ BX, 8(SP) - MOVQ CX, 16(SP) + MOVQ AX, 16(SP) + // store extra stuff we want to recover // spill MOVQ DI, 24(SP) MOVQ SI, 32(SP) - MOVQ CX, 40(SP) // need len to inc SI, DI after + MOVQ CX, 40(SP) CALL runtime·memmove(SB) // restore registers + 
MOVQ 16(SP), AX // copy_size MOVQ 24(SP), DI MOVQ 32(SP), SI - MOVQ 40(SP), CX + MOVQ 40(SP), CX // match_len // recalc initial values MOVQ dst_base+0(FP), R8 @@ -347,23 +366,83 @@ memmove_match: ADDQ dst_len+8(FP), R8 MOVQ src_base+24(FP), R9 ADDQ src_len+32(FP), R9 + MOVQ dict_base+48(FP), R14 + MOVQ dict_len+56(FP), R15 MOVQ R8, R12 SUBQ $32, R12 MOVQ R9, R13 SUBQ $16, R13 + // di+=copy_size + ADDQ AX, DI + + // 2) copy the rest from the current block + // CX = match_len - copy_size = rest_size + SUBQ AX, CX + MOVQ R11, BX + + // check if we have a copy overlap + // AX = &dst + rest_size + MOVQ CX, AX + ADDQ BX, AX + // if &dst + rest_size > di, copy byte by byte + CMPQ AX, DI + + JA copy_match_loop + +memmove_match: + // memmove(to, from, len) + MOVQ DI, 0(SP) + MOVQ BX, 8(SP) + MOVQ CX, 16(SP) + + // Spill registers. Increment DI now so we don't need to save CX. ADDQ CX, DI - JMP loop + MOVQ DI, 24(SP) + MOVQ SI, 32(SP) + + CALL runtime·memmove(SB) + + // restore registers + MOVQ 24(SP), DI + MOVQ 32(SP), SI + + // recalc initial values + MOVQ dst_base+0(FP), R8 + MOVQ R8, R11 // TODO: make these sensible numbers + ADDQ dst_len+8(FP), R8 + MOVQ src_base+24(FP), R9 + ADDQ src_len+32(FP), R9 + MOVQ R8, R12 + SUBQ $32, R12 + MOVQ R9, R13 + SUBQ $16, R13 + MOVQ dict_base+48(FP), R14 + MOVQ dict_len+56(FP), R15 + XORL CX, CX + +loopcheck: + // for si < len(src) + CMPQ SI, R9 + JB loop + +end: + // Remaining length must be zero. + TESTQ CX, CX + JNE err_corrupt + + SUBQ R11, DI + MOVQ DI, ret+72(FP) + RET err_corrupt: - MOVQ $-1, ret+48(FP) + MOVQ $-1, ret+72(FP) RET err_short_buf: - MOVQ $-2, ret+48(FP) + MOVQ $-2, ret+72(FP) RET -end: - SUBQ R11, DI - MOVQ DI, ret+48(FP) +err_short_dict: + MOVQ $-3, ret+72(FP) RET diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm.s b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm.s index 64be9adcaa..20b21fcf15 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm.s +++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm.s @@ -1,6 +1,7 @@ // +build gc // +build !noasm +#include "go_asm.h" #include "textflag.h" // Register allocation. @@ -10,21 +11,20 @@ #define dstend R3 #define srcend R4 #define match R5 // Match address. -#define token R6 -#define len R7 // Literal and match lengths. -#define offset R6 // Match offset; overlaps with token. -#define tmp1 R8 -#define tmp2 R9 +#define dictend R6 +#define token R7 +#define len R8 // Literal and match lengths. +#define offset R7 // Match offset; overlaps with token. +#define tmp1 R9 +#define tmp2 R11 #define tmp3 R12 -#define minMatch $4 - -// func decodeBlock(dst, src []byte) int -TEXT ·decodeBlock(SB), NOFRAME|NOSPLIT, $-4-28 - MOVW dst_base +0(FP), dst - MOVW dst_len +4(FP), dstend - MOVW src_base+12(FP), src - MOVW src_len +16(FP), srcend +// func decodeBlock(dst, src, dict []byte) int +TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $-4-40 + MOVW dst_base +0(FP), dst + MOVW dst_len +4(FP), dstend + MOVW src_base +12(FP), src + MOVW src_len +16(FP), srcend CMP $0, srcend BEQ shortSrc @@ -45,7 +45,8 @@ readLitlenLoop: CMP src, srcend BEQ shortSrc MOVBU.P 1(src), tmp1 - ADD tmp1, len + ADD.S tmp1, len + BVS shortDst CMP $255, tmp1 BEQ readLitlenLoop @@ -54,12 +55,13 @@ readLitlenDone: BEQ copyLiteralDone // Bounds check dst+len and src+len. - ADD dst, len, tmp1 - CMP dstend, tmp1 - //BHI shortDst // Uncomment for distinct error codes. 
- ADD src, len, tmp2 - CMP.LS srcend, tmp2 - BHI shortSrc + ADD.S dst, len, tmp1 + ADD.CC.S src, len, tmp2 + BCS shortSrc + CMP dstend, tmp1 + //BHI shortDst // Uncomment for distinct error codes. + CMP.LS srcend, tmp2 + BHI shortSrc // Copy literal. CMP $4, len @@ -95,36 +97,34 @@ copyLiteralLoopCond: SUB.S $4, len BPL copyLiteralLoop - // Restore len, which is now negative. - ADD $4, len - copyLiteralFinish: // Copy remaining 0-3 bytes. - TST $2, len - MOVHU.NE.P 2(src), tmp2 - MOVB.NE.P tmp2, 1(dst) - MOVW.NE tmp2 >> 8, tmp1 - MOVB.NE.P tmp1, 1(dst) - TST $1, len - MOVBU.NE.P 1(src), tmp1 - MOVB.NE.P tmp1, 1(dst) + // At this point, len may be < 0, but len&3 is still accurate. + TST $1, len + MOVB.NE.P 1(src), tmp3 + MOVB.NE.P tmp3, 1(dst) + TST $2, len + MOVB.NE.P 2(src), tmp1 + MOVB.NE.P tmp1, 2(dst) + MOVB.NE -1(src), tmp2 + MOVB.NE tmp2, -1(dst) copyLiteralDone: - CMP src, srcend - BEQ end - // Initial part of match length. // This frees up the token register for reuse as offset. AND $15, token, len + CMP src, srcend + BEQ end + // Read offset. - ADD $2, src + ADD.S $2, src + BCS shortSrc CMP srcend, src BHI shortSrc MOVBU -2(src), offset MOVBU -1(src), tmp1 - ORR tmp1 << 8, offset - CMP $0, offset + ORR.S tmp1 << 8, offset BEQ corrupt // Read rest of match length. @@ -135,20 +135,51 @@ readMatchlenLoop: CMP src, srcend BEQ shortSrc MOVBU.P 1(src), tmp1 - ADD tmp1, len + ADD.S tmp1, len + BVS shortDst CMP $255, tmp1 BEQ readMatchlenLoop readMatchlenDone: - // Bounds check dst+len+minMatch and match = dst-offset. - ADD dst, len, tmp1 - ADD minMatch, tmp1 - CMP dstend, tmp1 - //BHI shortDst // Uncomment for distinct error codes. - SUB offset, dst, match - CMP.LS match, dstorig - BHI corrupt + // Bounds check dst+len+minMatch. + ADD.S dst, len, tmp1 + ADD.CC.S $const_minMatch, tmp1 + BCS shortDst + CMP dstend, tmp1 + BHI shortDst + + RSB dst, offset, match + CMP dstorig, match + BGE copyMatch4 + + // match < dstorig means the match starts in the dictionary, + // at len(dict) - offset + (dst - dstorig). + MOVW dict_base+24(FP), match + MOVW dict_len +28(FP), dictend + + ADD $const_minMatch, len + + RSB dst, dstorig, tmp1 + RSB dictend, offset, tmp2 + ADD.S tmp2, tmp1 + BMI shortDict + ADD match, dictend + ADD tmp1, match + +copyDict: + MOVBU.P 1(match), tmp1 + MOVB.P tmp1, 1(dst) + SUB.S $1, len + CMP.NE match, dictend + BNE copyDict + + // If the match extends beyond the dictionary, the rest is at dstorig. + CMP $0, len + BEQ copyMatchDone + MOVW dstorig, match + B copyMatch + // Copy a regular match. // Since len+minMatch is at least four, we can do a 4× unrolled // byte copy loop. Using MOVW instead of four byte loads is faster, // but to remain portable we'd have to align match first, which is @@ -182,16 +213,19 @@ copyMatchDone: BNE loop end: + CMP $0, len + BNE corrupt SUB dstorig, dst, tmp1 - MOVW tmp1, ret+24(FP) + MOVW tmp1, ret+36(FP) RET - // The three error cases have distinct labels so we can put different + // The error cases have distinct labels so we can put different // return codes here when debugging, or if the error returns need to // be changed. 
+shortDict: shortDst: shortSrc: corrupt: MOVW $-1, tmp1 - MOVW tmp1, ret+24(FP) + MOVW tmp1, ret+36(FP) RET diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm64.s b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm64.s new file mode 100644 index 0000000000..d2fe11b8ea --- /dev/null +++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm64.s @@ -0,0 +1,241 @@ +// +build gc +// +build !noasm + +// This implementation assumes that strict alignment checking is turned off. +// The Go compiler makes the same assumption. + +#include "go_asm.h" +#include "textflag.h" + +// Register allocation. +#define dst R0 +#define dstorig R1 +#define src R2 +#define dstend R3 +#define dstend16 R4 // dstend - 16 +#define srcend R5 +#define srcend16 R6 // srcend - 16 +#define match R7 // Match address. +#define dict R8 +#define dictlen R9 +#define dictend R10 +#define token R11 +#define len R12 // Literal and match lengths. +#define lenRem R13 +#define offset R14 // Match offset. +#define tmp1 R15 +#define tmp2 R16 +#define tmp3 R17 +#define tmp4 R19 + +// func decodeBlock(dst, src, dict []byte) int +TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80 + LDP dst_base+0(FP), (dst, dstend) + ADD dst, dstend + MOVD dst, dstorig + + LDP src_base+24(FP), (src, srcend) + CBZ srcend, shortSrc + ADD src, srcend + + // dstend16 = max(dstend-16, 0) and similarly for srcend16. + SUBS $16, dstend, dstend16 + CSEL LO, ZR, dstend16, dstend16 + SUBS $16, srcend, srcend16 + CSEL LO, ZR, srcend16, srcend16 + + LDP dict_base+48(FP), (dict, dictlen) + ADD dict, dictlen, dictend + +loop: + // Read token. Extract literal length. + MOVBU.P 1(src), token + LSR $4, token, len + CMP $15, len + BNE readLitlenDone + +readLitlenLoop: + CMP src, srcend + BEQ shortSrc + MOVBU.P 1(src), tmp1 + ADDS tmp1, len + BVS shortDst + CMP $255, tmp1 + BEQ readLitlenLoop + +readLitlenDone: + CBZ len, copyLiteralDone + + // Bounds check dst+len and src+len. + ADDS dst, len, tmp1 + BCS shortSrc + ADDS src, len, tmp2 + BCS shortSrc + CMP dstend, tmp1 + BHI shortDst + CMP srcend, tmp2 + BHI shortSrc + + // Copy literal. + SUBS $16, len + BLO copyLiteralShort + +copyLiteralLoop: + LDP.P 16(src), (tmp1, tmp2) + STP.P (tmp1, tmp2), 16(dst) + SUBS $16, len + BPL copyLiteralLoop + + // Copy (final part of) literal of length 0-15. + // If we have >=16 bytes left in src and dst, just copy 16 bytes. +copyLiteralShort: + CMP dstend16, dst + CCMP LO, src, srcend16, $0b0010 // 0010 = preserve carry (LO). + BHS copyLiteralShortEnd + + AND $15, len + + LDP (src), (tmp1, tmp2) + ADD len, src + STP (tmp1, tmp2), (dst) + ADD len, dst + + B copyLiteralDone + + // Safe but slow copy near the end of src, dst. +copyLiteralShortEnd: + TBZ $3, len, 3(PC) + MOVD.P 8(src), tmp1 + MOVD.P tmp1, 8(dst) + TBZ $2, len, 3(PC) + MOVW.P 4(src), tmp2 + MOVW.P tmp2, 4(dst) + TBZ $1, len, 3(PC) + MOVH.P 2(src), tmp3 + MOVH.P tmp3, 2(dst) + TBZ $0, len, 3(PC) + MOVBU.P 1(src), tmp4 + MOVB.P tmp4, 1(dst) + +copyLiteralDone: + // Initial part of match length. + AND $15, token, len + + CMP src, srcend + BEQ end + + // Read offset. + ADDS $2, src + BCS shortSrc + CMP srcend, src + BHI shortSrc + MOVHU -2(src), offset + CBZ offset, corrupt + + // Read rest of match length. 
+ CMP $15, len + BNE readMatchlenDone + +readMatchlenLoop: + CMP src, srcend + BEQ shortSrc + MOVBU.P 1(src), tmp1 + ADDS tmp1, len + BVS shortDst + CMP $255, tmp1 + BEQ readMatchlenLoop + +readMatchlenDone: + ADD $const_minMatch, len + + // Bounds check dst+len. + ADDS dst, len, tmp2 + BCS shortDst + CMP dstend, tmp2 + BHI shortDst + + SUB offset, dst, match + CMP dstorig, match + BHS copyMatchTry8 + + // match < dstorig means the match starts in the dictionary, + // at len(dict) - offset + (dst - dstorig). + SUB dstorig, dst, tmp1 + SUB offset, dictlen, tmp2 + ADDS tmp2, tmp1 + BMI shortDict + ADD dict, tmp1, match + +copyDict: + MOVBU.P 1(match), tmp3 + MOVB.P tmp3, 1(dst) + SUBS $1, len + CCMP NE, dictend, match, $0b0100 // 0100 sets the Z (EQ) flag. + BNE copyDict + + CBZ len, copyMatchDone + + // If the match extends beyond the dictionary, the rest is at dstorig. + // Recompute the offset for the next check. + MOVD dstorig, match + SUB dstorig, dst, offset + +copyMatchTry8: + // Copy doublewords if both len and offset are at least eight. + // A 16-at-a-time loop doesn't provide a further speedup. + CMP $8, len + CCMP HS, offset, $8, $0 + BLO copyMatchTry4 + + AND $7, len, lenRem + SUB $8, len +copyMatchLoop8: + MOVD.P 8(match), tmp1 + MOVD.P tmp1, 8(dst) + SUBS $8, len + BPL copyMatchLoop8 + + MOVD (match)(len), tmp2 // match+len == match+lenRem-8. + ADD lenRem, dst + MOVD $0, len + MOVD tmp2, -8(dst) + B copyMatchDone + +copyMatchTry4: + // Copy words if both len and offset are at least four. + CMP $4, len + CCMP HS, offset, $4, $0 + BLO copyMatchLoop1 + + MOVWU.P 4(match), tmp2 + MOVWU.P tmp2, 4(dst) + SUBS $4, len + BEQ copyMatchDone + +copyMatchLoop1: + // Byte-at-a-time copy for small offsets <= 3. + MOVBU.P 1(match), tmp2 + MOVB.P tmp2, 1(dst) + SUBS $1, len + BNE copyMatchLoop1 + +copyMatchDone: + CMP src, srcend + BNE loop + +end: + CBNZ len, corrupt + SUB dstorig, dst, tmp1 + MOVD tmp1, ret+72(FP) + RET + + // The error cases have distinct labels so we can put different + // return codes here when debugging, or if the error returns need to + // be changed. 
+shortDict: +shortDst: +shortSrc: +corrupt: + MOVD $-1, tmp1 + MOVD tmp1, ret+72(FP) + RET diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_asm.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_asm.go index e26f8cd613..8d9023d100 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_asm.go +++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_asm.go @@ -1,4 +1,5 @@ -// +build amd64 arm +//go:build (amd64 || arm || arm64) && !appengine && gc && !noasm +// +build amd64 arm arm64 // +build !appengine // +build gc // +build !noasm @@ -6,4 +7,4 @@ package lz4block //go:noescape -func decodeBlock(dst, src []byte) int +func decodeBlock(dst, src, dict []byte) int diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_other.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_other.go index 52df2f2b8e..9f568fbb1a 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_other.go +++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_other.go @@ -1,15 +1,23 @@ -// +build !amd64,!arm appengine !gc noasm +//go:build (!amd64 && !arm && !arm64) || appengine || !gc || noasm +// +build !amd64,!arm,!arm64 appengine !gc noasm package lz4block -import "encoding/binary" +import ( + "encoding/binary" +) -func decodeBlock(dst, src []byte) (ret int) { +func decodeBlock(dst, src, dict []byte) (ret int) { // Restrict capacities so we don't read or write out of bounds. dst = dst[:len(dst):len(dst)] src = src[:len(src):len(src)] const hasError = -2 + + if len(src) == 0 { + return hasError + } + defer func() { if recover() != nil { ret = hasError @@ -17,7 +25,7 @@ func decodeBlock(dst, src []byte) (ret int) { }() var si, di uint - for { + for si < uint(len(src)) { // Literals and match lengths (token). b := uint(src[si]) si++ @@ -38,27 +46,29 @@ func decodeBlock(dst, src []byte) (ret int) { // if the match length (4..18) fits within the literals, then copy // all 18 bytes, even if not all are part of the literals. mLen += 4 - if offset := u16(src[si:]); mLen <= offset { + if offset := u16(src[si:]); mLen <= offset && offset < di { i := di - offset - end := i + 18 - if end > uint(len(dst)) { - // The remaining buffer may not hold 18 bytes. - // See https://github.com/pierrec/lz4/issues/51. - end = uint(len(dst)) + // The remaining buffer may not hold 18 bytes. + // See https://github.com/pierrec/lz4/issues/51. + if end := i + 18; end <= uint(len(dst)) { + copy(dst[di:], dst[i:end]) + si += 2 + di += mLen + continue } - copy(dst[di:], dst[i:end]) - si += 2 - di += mLen - continue } } case lLen == 0xF: - for src[si] == 0xFF { - lLen += 0xFF + for { + x := uint(src[si]) + if lLen += x; int(lLen) < 0 { + return hasError + } si++ + if x != 0xFF { + break + } } - lLen += uint(src[si]) - si++ fallthrough default: copy(dst[di:di+lLen], src[si:si+lLen]) @@ -66,9 +76,11 @@ func decodeBlock(dst, src []byte) (ret int) { di += lLen } } - if si == uint(len(src)) { - return int(di) - } else if si > uint(len(src)) { + + mLen := b & 0xF + if si == uint(len(src)) && mLen == 0 { + break + } else if si >= uint(len(src)) { return hasError } @@ -79,18 +91,35 @@ func decodeBlock(dst, src []byte) (ret int) { si += 2 // Match. 
- mLen := b & 0xF - if mLen == 0xF { - for src[si] == 0xFF { - mLen += 0xFF + mLen += minMatch + if mLen == minMatch+0xF { + for { + x := uint(src[si]) + if mLen += x; int(mLen) < 0 { + return hasError + } si++ + if x != 0xFF { + break + } } - mLen += uint(src[si]) - si++ } - mLen += minMatch // Copy the match. + if di < offset { + // The match is beyond our block, meaning the first part + // is in the dictionary. + fromDict := dict[uint(len(dict))+di-offset:] + n := uint(copy(dst[di:di+mLen], fromDict)) + di += n + if mLen -= n; mLen == 0 { + continue + } + // We copied n = offset-di bytes from the dictionary, + // then set di = di+n = offset, so the following code + // copies from dst[di-offset:] = dst[0:]. + } + expanded := dst[di-offset:] if mLen > offset { // Efficiently copy the match dst[di-offset:di] into the dst slice. @@ -103,6 +132,8 @@ func decodeBlock(dst, src []byte) (ret int) { } di += uint(copy(dst[di:di+mLen], expanded[:mLen])) } + + return int(di) } func u16(p []byte) uint { return uint(binary.LittleEndian.Uint16(p)) } diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/block.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/block.go index 5e0c062ec0..e96465460c 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/block.go +++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/block.go @@ -115,18 +115,23 @@ func (b *Blocks) initR(f *Frame, num int, src io.Reader) (chan []byte, error) { block := NewFrameDataBlock(f) cumx, err = block.Read(f, src, 0) if err != nil { + block.Close(f) break } // Recheck for an error as reading may be slow and uncompressing is expensive. if b.ErrorR() != nil { + block.Close(f) break } c := make(chan []byte) blocks <- c go func() { - data, err := block.Uncompress(f, size.Get(), false) + defer block.Close(f) + data, err := block.Uncompress(f, size.Get(), nil, false) if err != nil { b.closeR(err) + // Close the block channel to indicate an error. + close(c) } else { c <- data } @@ -147,13 +152,24 @@ func (b *Blocks) initR(f *Frame, num int, src io.Reader) (chan []byte, error) { // on the returned channel. go func(leg bool) { defer close(blocks) + skipBlocks := false for c := range blocks { - buf := <-c + buf, ok := <-c + if !ok { + // A closed channel indicates an error. + // All remaining channels should be discarded. + skipBlocks = true + continue + } if buf == nil { // Signal to end the loop. close(c) return } + if skipBlocks { + // A previous error has occurred, skipping remaining channels. + continue + } // Perform checksum now as the blocks are received in order. 
if f.Descriptor.Flags.ContentChecksum() { _, _ = f.checksum.Write(buf) @@ -300,12 +316,12 @@ func (b *FrameDataBlock) Read(f *Frame, src io.Reader, cum uint32) (uint32, erro return x, nil } -func (b *FrameDataBlock) Uncompress(f *Frame, dst []byte, sum bool) ([]byte, error) { +func (b *FrameDataBlock) Uncompress(f *Frame, dst, dict []byte, sum bool) ([]byte, error) { if b.Size.Uncompressed() { n := copy(dst, b.data) dst = dst[:n] } else { - n, err := lz4block.UncompressBlock(b.data, dst) + n, err := lz4block.UncompressBlock(b.data, dst, dict) if err != nil { return nil, err } diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame.go index cfbd5674d9..18192a9433 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame.go +++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame.go @@ -77,16 +77,16 @@ func (f *Frame) isLegacy() bool { return f.Magic == frameMagicLegacy } -func (f *Frame) InitR(src io.Reader, num int) (chan []byte, error) { +func (f *Frame) ParseHeaders(src io.Reader) error { if f.Magic > 0 { // Header already read. - return nil, nil + return nil } newFrame: var err error if f.Magic, err = f.readUint32(src); err != nil { - return nil, err + return err } switch m := f.Magic; { case m == frameMagic || m == frameMagicLegacy: @@ -94,19 +94,23 @@ newFrame: case m>>8 == frameSkipMagic>>8: skip, err := f.readUint32(src) if err != nil { - return nil, err + return err } if _, err := io.CopyN(ioutil.Discard, src, int64(skip)); err != nil { - return nil, err + return err } goto newFrame default: - return nil, lz4errors.ErrInvalidFrame + return lz4errors.ErrInvalidFrame } if err := f.Descriptor.initR(f, src); err != nil { - return nil, err + return err } f.checksum.Reset() + return nil +} + +func (f *Frame) InitR(src io.Reader, num int) (chan []byte, error) { return f.Blocks.initR(f, num, src) } diff --git a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero.go b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero.go index 8d3206a87c..651d10c104 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero.go +++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero.go @@ -1,5 +1,5 @@ // Package xxh32 implements the very fast XXH hashing algorithm (32 bits version). -// (https://github.com/Cyan4973/XXH/) +// (ported from the reference implementation https://github.com/Cyan4973/xxHash/) package xxh32 import ( diff --git a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.s b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.s index 0e9f146a36..c18ffd5743 100644 --- a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.s +++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.s @@ -1,16 +1,8 @@ // +build !noasm +#include "go_asm.h" #include "textflag.h" -#define prime1 $2654435761 -#define prime2 $2246822519 -#define prime3 $3266489917 -#define prime4 $668265263 -#define prime5 $374761393 - -#define prime1plus2 $606290984 -#define prime1minus $1640531535 - // Register allocation. #define p R0 #define n R1 @@ -106,12 +98,12 @@ TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16 MOVW input_base+0(FP), p MOVW input_len+4(FP), n - MOVW prime1, prime1r - MOVW prime2, prime2r + MOVW $const_prime1, prime1r + MOVW $const_prime2, prime2r // Set up h for n < 16. 
It's tempting to say {ADD prime5, n, h} // here, but that's a pseudo-op that generates a load through R11. - MOVW prime5, prime5r + MOVW $const_prime5, prime5r ADD prime5r, n, h CMP $0, n BEQ end @@ -121,10 +113,10 @@ TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16 SUB.S $16, n BMI loop16done - MOVW prime1plus2, v1 - MOVW prime2, v2 - MOVW $0, v3 - MOVW prime1minus, v4 + ADD prime1r, prime2r, v1 + MOVW prime2r, v2 + MOVW $0, v3 + RSB $0, prime1r, v4 TST $3, p BNE loop16unaligned @@ -154,9 +146,9 @@ loop16done: ADD $16, n // Restore number of bytes left. SUB.S $4, n - MOVW prime3, prime3r + MOVW $const_prime3, prime3r BMI loop4done - MOVW prime4, prime4r + MOVW $const_prime4, prime4r TST $3, p BNE loop4unaligned @@ -193,7 +185,7 @@ loop4done: ADD.S $4, n // Restore number of bytes left. BEQ end - MOVW prime5, prime5r + MOVW $const_prime5, prime5r loop1: SUB.S $1, n @@ -206,7 +198,7 @@ loop1: BNE loop1 end: - MOVW prime3, prime3r + MOVW $const_prime3, prime3r EOR h >> 15, h MUL prime2r, h EOR h >> 13, h @@ -222,8 +214,8 @@ TEXT ·update(SB), NOFRAME|NOSPLIT, $-4-20 MOVW v+0(FP), p MOVM.IA (p), [v1, v2, v3, v4] - MOVW prime1, prime1r - MOVW prime2, prime2r + MOVW $const_prime1, prime1r + MOVW $const_prime2, prime2r // Process buf, if not nil. MOVW buf+4(FP), p diff --git a/vendor/github.com/pierrec/lz4/v4/lz4.go b/vendor/github.com/pierrec/lz4/v4/lz4.go index c585d4064f..a62022e088 100644 --- a/vendor/github.com/pierrec/lz4/v4/lz4.go +++ b/vendor/github.com/pierrec/lz4/v4/lz4.go @@ -35,7 +35,17 @@ func CompressBlockBound(n int) int { // // An error is returned if the source data is invalid or the destination buffer is too small. func UncompressBlock(src, dst []byte) (int, error) { - return lz4block.UncompressBlock(src, dst) + return lz4block.UncompressBlock(src, dst, nil) +} + +// UncompressBlockWithDict uncompresses the source buffer into the destination one using a +// dictionary, and returns the uncompressed size. +// +// The destination buffer must be sized appropriately. +// +// An error is returned if the source data is invalid or the destination buffer is too small. +func UncompressBlockWithDict(src, dst, dict []byte) (int, error) { + return lz4block.UncompressBlock(src, dst, dict) } // A Compressor compresses data into the LZ4 block format. 
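[editor note] The lz4.go hunk above promotes dictionary-aware block decompression to the public API as `UncompressBlockWithDict`. A minimal round-trip sketch, assuming only the package-level functions visible in this diff (`CompressBlockBound`, `CompressBlock`, and the new `UncompressBlockWithDict`); the sample data is illustrative:

```go
package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/pierrec/lz4/v4"
)

func main() {
	src := bytes.Repeat([]byte("hello, lz4 block mode "), 100)

	// Size dst for the worst case (incompressible input).
	dst := make([]byte, lz4.CompressBlockBound(len(src)))
	n, err := lz4.CompressBlock(src, dst)
	if err != nil {
		log.Fatal(err)
	}

	// A nil dict takes the same code path as plain UncompressBlock,
	// since the wrapper forwards lz4block.UncompressBlock(src, dst, nil).
	out := make([]byte, len(src))
	m, err := lz4.UncompressBlockWithDict(dst[:n], out, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(m == len(src) && bytes.Equal(out[:m], src)) // true
}
```

A non-nil dict only matters for blocks that were compressed against a dictionary prefix, which is what the amd64/arm64 decoder changes above implement.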
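[editor note] The options.go changes that follow wire the existing writer options into the `CompressingReader` added earlier in this diff. A hedged usage sketch (the input filename is hypothetical; only `NewCompressingReader`, `Apply`, `ChecksumOption`, and `CompressionLevelOption` from this diff are assumed):

```go
package main

import (
	"io"
	"log"
	"os"

	"github.com/pierrec/lz4/v4"
)

func main() {
	f, err := os.Open("input.txt") // hypothetical input file
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Pull LZ4 frame data out of a plain stream, e.g. to hand to
	// http.Request.Body without an io.Pipe and an extra goroutine.
	zr := lz4.NewCompressingReader(f)
	if err := zr.Apply(lz4.ChecksumOption(true), lz4.CompressionLevelOption(lz4.Level5)); err != nil {
		log.Fatal(err)
	}

	// Copy the compressed frame somewhere; stdout here for brevity.
	if _, err := io.Copy(os.Stdout, zr); err != nil {
		log.Fatal(err)
	}
}
```

Note that `Apply` rejects options once reading has started (the `crStateInitial` check), so options must be applied before the first `Read`.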
diff --git a/vendor/github.com/pierrec/lz4/v4/options.go b/vendor/github.com/pierrec/lz4/v4/options.go index 4e1b6703b5..57a44e767d 100644 --- a/vendor/github.com/pierrec/lz4/v4/options.go +++ b/vendor/github.com/pierrec/lz4/v4/options.go @@ -2,10 +2,11 @@ package lz4 import ( "fmt" - "github.com/pierrec/lz4/v4/internal/lz4block" - "github.com/pierrec/lz4/v4/internal/lz4errors" "reflect" "runtime" + + "github.com/pierrec/lz4/v4/internal/lz4block" + "github.com/pierrec/lz4/v4/internal/lz4errors" ) //go:generate go run golang.org/x/tools/cmd/stringer -type=BlockSize,CompressionLevel -output options_gen.go @@ -56,6 +57,13 @@ func BlockSizeOption(size BlockSize) Option { } w.frame.Descriptor.Flags.BlockSizeIndexSet(lz4block.Index(size)) return nil + case *CompressingReader: + size := uint32(size) + if !lz4block.IsValid(size) { + return fmt.Errorf("%w: %d", lz4errors.ErrOptionInvalidBlockSize, size) + } + w.frame.Descriptor.Flags.BlockSizeIndexSet(lz4block.Index(size)) + return nil } return lz4errors.ErrOptionNotApplicable } @@ -71,6 +79,9 @@ func BlockChecksumOption(flag bool) Option { case *Writer: w.frame.Descriptor.Flags.BlockChecksumSet(flag) return nil + case *CompressingReader: + w.frame.Descriptor.Flags.BlockChecksumSet(flag) + return nil } return lz4errors.ErrOptionNotApplicable } @@ -86,6 +97,9 @@ func ChecksumOption(flag bool) Option { case *Writer: w.frame.Descriptor.Flags.ContentChecksumSet(flag) return nil + case *CompressingReader: + w.frame.Descriptor.Flags.ContentChecksumSet(flag) + return nil } return lz4errors.ErrOptionNotApplicable } @@ -103,6 +117,10 @@ func SizeOption(size uint64) Option { w.frame.Descriptor.Flags.SizeSet(size > 0) w.frame.Descriptor.ContentSize = size return nil + case *CompressingReader: + w.frame.Descriptor.Flags.SizeSet(size > 0) + w.frame.Descriptor.ContentSize = size + return nil } return lz4errors.ErrOptionNotApplicable } @@ -161,6 +179,14 @@ func CompressionLevelOption(level CompressionLevel) Option { } w.level = lz4block.CompressionLevel(level) return nil + case *CompressingReader: + switch level { + case Fast, Level1, Level2, Level3, Level4, Level5, Level6, Level7, Level8, Level9: + default: + return fmt.Errorf("%w: %d", lz4errors.ErrOptionInvalidCompressionLevel, level) + } + w.level = lz4block.CompressionLevel(level) + return nil } return lz4errors.ErrOptionNotApplicable } @@ -185,6 +211,9 @@ func OnBlockDoneOption(handler func(size int)) Option { case *Reader: rw.handler = handler return nil + case *CompressingReader: + rw.handler = handler + return nil } return lz4errors.ErrOptionNotApplicable } diff --git a/vendor/github.com/pierrec/lz4/v4/reader.go b/vendor/github.com/pierrec/lz4/v4/reader.go index 403aaf697a..275daad7cb 100644 --- a/vendor/github.com/pierrec/lz4/v4/reader.go +++ b/vendor/github.com/pierrec/lz4/v4/reader.go @@ -1,6 +1,7 @@ package lz4 import ( + "bytes" "io" "github.com/pierrec/lz4/v4/internal/lz4block" @@ -40,6 +41,7 @@ type Reader struct { idx int // size of pending data handler func(int) cum uint32 + dict []byte } func (*Reader) private() {} @@ -77,6 +79,15 @@ func (r *Reader) isNotConcurrent() bool { } func (r *Reader) init() error { + err := r.frame.ParseHeaders(r.src) + if err != nil { + return err + } + if !r.frame.Descriptor.Flags.BlockIndependence() { + // We can't decompress dependent blocks concurrently. 
+        // Instead of returning an error to the user, silently drop concurrency.
+        r.num = 1
+    }
     data, err := r.frame.InitR(r.src, r.num)
     if err != nil {
         return err
     }
@@ -162,10 +173,20 @@ func (r *Reader) read(buf []byte) (int, error) {
         direct = true
         dst = buf
     }
-    dst, err = block.Uncompress(r.frame, dst, true)
+    dst, err = block.Uncompress(r.frame, dst, r.dict, true)
     if err != nil {
         return 0, err
     }
+    if !r.frame.Descriptor.Flags.BlockIndependence() {
+        if len(r.dict)+len(dst) > 128*1024 {
+            preserveSize := 64*1024 - len(dst)
+            if preserveSize < 0 {
+                preserveSize = 0
+            }
+            r.dict = r.dict[len(r.dict)-preserveSize:]
+        }
+        r.dict = append(r.dict, dst...)
+    }
     r.cum += uint32(len(dst))
     if direct {
         return len(dst), nil
@@ -175,10 +196,8 @@ func (r *Reader) read(buf []byte) (int, error) {
 }
 
 // Reset clears the state of the Reader r such that it is equivalent to its
-// initial state from NewReader, but instead writing to writer.
+// initial state from NewReader, but instead reading from reader.
 // No access to reader is performed.
-//
-// w.Close must be called before Reset.
 func (r *Reader) Reset(reader io.Reader) {
     if r.data != nil {
         lz4block.Put(r.data)
@@ -241,3 +260,16 @@ func (r *Reader) WriteTo(w io.Writer) (n int64, err error) {
         }
     }
 }
+
+// ValidFrameHeader returns a bool indicating whether the given byte slice matches an LZ4 frame header.
+func ValidFrameHeader(in []byte) (bool, error) {
+    f := lz4stream.NewFrame()
+    err := f.ParseHeaders(bytes.NewReader(in))
+    if err == nil {
+        return true, nil
+    }
+    if err == lz4errors.ErrInvalidFrame {
+        return false, nil
+    }
+    return false, err
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/writer.go b/vendor/github.com/pierrec/lz4/v4/writer.go
index 44a43d251b..4358adee10 100644
--- a/vendor/github.com/pierrec/lz4/v4/writer.go
+++ b/vendor/github.com/pierrec/lz4/v4/writer.go
@@ -49,12 +49,12 @@ func (w *Writer) Apply(options ...Option) (err error) {
     default:
         return lz4errors.ErrOptionClosedOrError
     }
+    w.Reset(w.src)
     for _, o := range options {
         if err = o(w); err != nil {
             return
         }
     }
-    w.Reset(w.src)
     return
 }
 
@@ -65,10 +65,8 @@ func (w *Writer) isNotConcurrent() bool {
 // init sets up the Writer when in newState. It does not change the Writer state.
 func (w *Writer) init() error {
     w.frame.InitW(w.src, w.num, w.legacy)
-    if true || !w.isNotConcurrent() {
-        size := w.frame.Descriptor.Flags.BlockSizeIndex()
-        w.data = size.Get()
-    }
+    size := w.frame.Descriptor.Flags.BlockSizeIndex()
+    w.data = size.Get()
     w.idx = 0
     return w.frame.Descriptor.Write(w.frame, w.src)
 }
@@ -89,7 +87,7 @@ func (w *Writer) Write(buf []byte) (n int, err error) {
     zn := len(w.data)
 
     for len(buf) > 0 {
-        if w.idx == 0 && len(buf) >= zn {
+        if w.isNotConcurrent() && w.idx == 0 && len(buf) >= zn {
             // Avoid a copy as there is enough data for a block.
             if err = w.write(buf[:zn], false); err != nil {
                 return
@@ -146,17 +144,20 @@ func (w *Writer) write(data []byte, safe bool) error {
     return nil
 }
 
-// Close closes the Writer, flushing any unwritten data to the underlying io.Writer,
-// but does not close the underlying io.Writer.
-func (w *Writer) Close() (err error) {
+// Flush writes any buffered data to the underlying writer immediately.
+func (w *Writer) Flush() (err error) {
     switch w.state.state {
     case writeState:
     case errorState:
         return w.state.err
+    case newState:
+        if err = w.init(); w.state.next(err) {
+            return
+        }
     default:
         return nil
     }
-    defer w.state.nextd(&err)
+
     if w.idx > 0 {
         // Flush pending data, disable w.data freeing as it is done later on.
if err = w.write(w.data[:w.idx], false); err != nil { @@ -164,13 +165,22 @@ func (w *Writer) Close() (err error) { } w.idx = 0 } - err = w.frame.CloseW(w.src, w.num) + return nil +} + +// Close closes the Writer, flushing any unwritten data to the underlying writer +// without closing it. +func (w *Writer) Close() error { + if err := w.Flush(); err != nil { + return err + } + err := w.frame.CloseW(w.src, w.num) // It is now safe to free the buffer. if w.data != nil { lz4block.Put(w.data) w.data = nil } - return + return err } // Reset clears the state of the Writer w such that it is equivalent to its @@ -228,6 +238,5 @@ func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) { data = size.Get() } } - err = w.Close() return } diff --git a/vendor/github.com/sorairolake/lzip-go/.bumpversion.toml b/vendor/github.com/sorairolake/lzip-go/.bumpversion.toml new file mode 100644 index 0000000000..76cfb0f02e --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/.bumpversion.toml @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: 2024 Shun Sakai +# +# SPDX-License-Identifier: Apache-2.0 OR MIT + +[tool.bumpversion] +current_version = "0.3.5" + +[[tool.bumpversion.files]] +filename = "cmd/glzip/cli.go" + +[[tool.bumpversion.files]] +filename = "cmd/glzip/testdata/version.ct" + +[[tool.bumpversion.files]] +filename = "docs/man/man1/glzip.1.adoc" diff --git a/vendor/github.com/sorairolake/lzip-go/.gitignore b/vendor/github.com/sorairolake/lzip-go/.gitignore new file mode 100644 index 0000000000..caead21ccb --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/.gitignore @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: 2024 Shun Sakai +# +# SPDX-License-Identifier: Apache-2.0 OR MIT + +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib +/glzip + +# Test binary, built with `go test -c` +*.test + +# Generated man page +glzip.1 + +# GoReleaser +dist/ diff --git a/vendor/github.com/sorairolake/lzip-go/.goreleaser.yaml b/vendor/github.com/sorairolake/lzip-go/.goreleaser.yaml new file mode 100644 index 0000000000..42c78a1615 --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/.goreleaser.yaml @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: 2024 Shun Sakai +# +# SPDX-License-Identifier: Apache-2.0 OR MIT + +version: 1 + +before: + hooks: + - go mod tidy + - asciidoctor -b manpage docs/man/man1/glzip.1.adoc + +builds: + - main: ./cmd/glzip + binary: glzip + flags: + - -trimpath + ldflags: + - -s -w + env: + - CGO_ENABLED=0 + goos: + - darwin + - freebsd + - linux + - openbsd + - windows + goarch: + - amd64 + - arm64 + +archives: + - format: tar.zst + # use zip for windows archives + format_overrides: + - goos: windows + format: zip + files: + - AUTHORS.adoc + - CHANGELOG.adoc + - CONTRIBUTING.adoc + - docs/man/man1/glzip.1 + - LICENSES/* + - README.md + +checksum: + algorithm: sha3-512 + +changelog: + sort: asc + filters: + exclude: + - "^docs:" + - "^test:" diff --git a/vendor/github.com/sorairolake/lzip-go/AUTHORS.adoc b/vendor/github.com/sorairolake/lzip-go/AUTHORS.adoc new file mode 100644 index 0000000000..53949a7d20 --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/AUTHORS.adoc @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: 2024 Shun Sakai +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + += List of Authors + +== Original author + +* https://github.com/sorairolake[Shun Sakai] diff --git a/vendor/github.com/sorairolake/lzip-go/CHANGELOG.adoc 
b/vendor/github.com/sorairolake/lzip-go/CHANGELOG.adoc new file mode 100644 index 0000000000..4834bbdf2c --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/CHANGELOG.adoc @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: 2024 Shun Sakai +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + += Changelog +:toc: preamble +:project-url: https://github.com/sorairolake/lzip-go +:compare-url: {project-url}/compare +:issue-url: {project-url}/issues +:pull-request-url: {project-url}/pull + +All notable changes to this project will be documented in this file. + +The format is based on https://keepachangelog.com/[Keep a Changelog], and this +project adheres to https://semver.org/[Semantic Versioning]. + +== {compare-url}/v0.3.4\...v0.3.5[0.3.5] - 2024-08-04 + +=== Changed + +* Update man pages + +== {compare-url}/v0.3.3\...v0.3.4[0.3.4] - 2024-05-02 + +=== Changed + +* Change to provide pre-built binaries ({pull-request-url}/21[#21]) + +== {compare-url}/v0.3.2\...v0.3.3[0.3.3] - 2024-04-16 + +=== Changed + +* Update document + +== {compare-url}/v0.3.1\...v0.3.2[0.3.2] - 2024-04-10 + +=== Changed + +* Ungroup constants ({pull-request-url}/13[#13]) + +== {compare-url}/v0.3.0\...v0.3.1[0.3.1] - 2024-04-08 + +=== Changed + +* Update document for errors ({pull-request-url}/11[#11]) + +== {compare-url}/v0.2.0\...v0.3.0[0.3.0] - 2024-04-07 + +=== Changed + +* Change errors to include details ({pull-request-url}/8[#8]) + +== {compare-url}/v0.1.0\...v0.2.0[0.2.0] - 2024-04-05 + +=== Added + +* Add a simple command-line utility for reading and writing of lzip format + compressed files ({pull-request-url}/4[#4]) + +=== Changed + +* Export constants regarding the dictionary size and the member size + ({pull-request-url}/3[#3]) +* Change the type of `WriterOptions.DictSize` from `int` to `uint32` + ({pull-request-url}/5[#5]) + +== {project-url}/releases/tag/v0.1.0[0.1.0] - 2024-04-04 + +=== Added + +* Initial release diff --git a/vendor/github.com/sorairolake/lzip-go/CODE_OF_CONDUCT.md b/vendor/github.com/sorairolake/lzip-go/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..4063f63044 --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/CODE_OF_CONDUCT.md @@ -0,0 +1,138 @@ + + +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. 
+ +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/vendor/github.com/sorairolake/lzip-go/CONTRIBUTING.adoc b/vendor/github.com/sorairolake/lzip-go/CONTRIBUTING.adoc new file mode 100644 index 0000000000..1fb6474552 --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/CONTRIBUTING.adoc @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: 2024 Shun Sakai +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + += Contribution Guide +:git-flow-url: https://nvie.com/posts/a-successful-git-branching-model/ +:commit-messages-guide-url: https://github.com/RomuloOliveira/commit-messages-guide +:conventionalcommits-url: https://www.conventionalcommits.org/en/v1.0.0/ + +Thank you for your interest in contributing to this project! If you would like +to contribute to this project, please follow the instructions below if possible. + +== Branching model + +The branching model of this project is based on the {git-flow-url}[git-flow]. + +== Style guides + +=== Commit message + +Please see the {commit-messages-guide-url}[Commit messages guide] and the +{conventionalcommits-url}[Conventional Commits]. + +== Submitting a pull request + +. Create a working branch from the `develop` branch. The branch name should be + something other than `develop` or `master`. +. Create your patch. If your change is a feature or a bugfix, please add a test + case if possible. Note that the change must pass the CI. +. Please update the copyright information if possible. This project is + compliant with version 3.2 of the + https://reuse.software/spec/[_REUSE Specification_]. + https://github.com/fsfe/reuse-tool[`reuse`] is useful for updating the + copyright information. +. Please update the link:CHANGELOG.adoc[Changelog] if possible. +. Please read and agree to follow the link:CODE_OF_CONDUCT.md[Code of Conduct]. + +== Development + +=== Useful development tools + +The https://github.com/casey/just[just] command runner can be used. Run +`just --list` for more details. 
+ +.Run tests +[source,sh] +---- +just test +---- + +.Run the formatter +[source,sh] +---- +just fmt +---- + +.Run the linter +[source,sh] +---- +just lint +---- diff --git a/vendor/github.com/sorairolake/lzip-go/LICENSE b/vendor/github.com/sorairolake/lzip-go/LICENSE new file mode 100644 index 0000000000..eec8edd36e --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/LICENSE @@ -0,0 +1,225 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +--- + +MIT License + +Copyright (c) 2024 Shun Sakai + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/sorairolake/lzip-go/README.md b/vendor/github.com/sorairolake/lzip-go/README.md new file mode 100644 index 0000000000..1681287575 --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/README.md @@ -0,0 +1,119 @@ + + +# lzip-go + +[![CI][ci-badge]][ci-url] +[![Go Reference][reference-badge]][reference-url] +![Go version][go-version-badge] + +**lzip-go** is an implementation of the [lzip compressed format] written in +pure [Go]. + +This package supports reading and writing of lzip compressed streams. + +## Usage + +To install this package: + +```sh +go get -u github.com/sorairolake/lzip-go +``` + +### Example + +Please see [`example_test.go`]. + +### Documentation + +See the [documentation][reference-url] for more details. + +## Command-line utility + +This package includes a simple command-line utility for reading and writing of +lzip format compressed files. + +### Installation + +#### From source + +```sh +go install github.com/sorairolake/lzip-go/cmd/glzip@latest +``` + +#### From binaries + +The [release page] contains pre-built binaries for Linux, macOS, Windows and +others. + +#### How to build + +To build the command-line utility: + +```sh +just build-cmd +``` + +To build a man page: + +```sh +just build-man +``` + +The man page is generated in `docs/man/man1`. Note that [Asciidoctor] is +required when building the man page. + +### Usage + +Please see [`glzip(1)`]. + +## Minimum Go version + +This package requires the minimum version of Go 1.22. + +## Changelog + +Please see [CHANGELOG.adoc]. + +## Contributing + +Please see [CONTRIBUTING.adoc]. 
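The Usage section above points at `example_test.go`, which is not carried into the vendor tree. Based on the `reader.go` and `writer.go` sources vendored later in this diff, a minimal round-trip sketch might look like the following; the payload and `log.Fatal` error handling are illustrative, not taken from the package's own examples:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"log"

	lzip "github.com/sorairolake/lzip-go"
)

func main() {
	var buf bytes.Buffer

	// Compress. NewWriter uses DefaultDictSize (8 MiB); NewWriterOptions
	// accepts a custom DictSize between MinDictSize and MaxDictSize.
	w := lzip.NewWriter(&buf)
	if _, err := w.Write([]byte("Hello, lzip!")); err != nil {
		log.Fatal(err)
	}
	// Close flushes the LZMA stream and appends the 20-byte trailer
	// (CRC32, data size, member size); buf itself stays usable.
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}

	// Decompress. NewReader checks the magic number, version, and
	// dictionary size in the 6-byte header before decoding.
	r, err := lzip.NewReader(&buf)
	if err != nil {
		log.Fatal(err)
	}
	out, err := io.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s\n", out)
}
```

Note that `Close` only finalizes the lzip member; closing the destination, if needed, remains the caller's job.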
+
+## Acknowledgment
+
+The API of this package is based on the [`compress/gzip`] package.
+
+This package uses the [`github.com/ulikunitz/xz/lzma`] package to encode and
+decode LZMA streams.
+
+## License
+
+Copyright © 2024 Shun Sakai (see [AUTHORS.adoc])
+
+This package is distributed under the terms of either the _Apache License 2.0_
+or the _MIT License_.
+
+This project is compliant with version 3.2 of the [_REUSE Specification_]. See
+copyright notices of individual files for more details on copyright and
+licensing information.
+
+[ci-badge]: https://img.shields.io/github/actions/workflow/status/sorairolake/lzip-go/CI.yaml?branch=develop&style=for-the-badge&logo=github&label=CI
+[ci-url]: https://github.com/sorairolake/lzip-go/actions?query=branch%3Adevelop+workflow%3ACI++
+[reference-badge]: https://img.shields.io/badge/Go-Reference-steelblue?style=for-the-badge&logo=go
+[reference-url]: https://pkg.go.dev/github.com/sorairolake/lzip-go
+[go-version-badge]: https://img.shields.io/github/go-mod/go-version/sorairolake/lzip-go?style=for-the-badge&logo=go
+[lzip compressed format]: https://www.nongnu.org/lzip/manual/lzip_manual.html#File-format
+[Go]: https://go.dev/
+[`example_test.go`]: example_test.go
+[release page]: https://github.com/sorairolake/lzip-go/releases
+[Asciidoctor]: https://asciidoctor.org/
+[`glzip(1)`]: docs/man/man1/glzip.1.adoc
+[CHANGELOG.adoc]: CHANGELOG.adoc
+[CONTRIBUTING.adoc]: CONTRIBUTING.adoc
+[`compress/gzip`]: https://pkg.go.dev/compress/gzip
+[`github.com/ulikunitz/xz/lzma`]: https://pkg.go.dev/github.com/ulikunitz/xz/lzma
+[AUTHORS.adoc]: AUTHORS.adoc
+[_REUSE Specification_]: https://reuse.software/spec/
diff --git a/vendor/github.com/sorairolake/lzip-go/error.go b/vendor/github.com/sorairolake/lzip-go/error.go
new file mode 100644
index 0000000000..900a8fd9f5
--- /dev/null
+++ b/vendor/github.com/sorairolake/lzip-go/error.go
@@ -0,0 +1,107 @@
+// SPDX-FileCopyrightText: 2024 Shun Sakai
+//
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+package lzip
+
+import "errors"
+
+// ErrInvalidMagic represents an error due to an invalid magic number.
+var ErrInvalidMagic = errors.New("lzip: invalid magic number")
+
+// UnsupportedVersionError represents an error due to the version number
+// stored in the header indicating a lzip format version that is not
+// supported by this package.
+type UnsupportedVersionError struct {
+	// Version represents the obtained version number.
+	Version byte
+}
+
+// Error returns a string representation of an [UnsupportedVersionError].
+func (e *UnsupportedVersionError) Error() string {
+	return "lzip: unsupported version number"
+}
+
+// UnknownVersionError represents an error due to the version number stored
+// in the header not being recognized by this package.
+type UnknownVersionError struct {
+	// Version represents the obtained version number.
+	Version byte
+}
+
+// Error returns a string representation of an [UnknownVersionError].
+func (e *UnknownVersionError) Error() string {
+	return "lzip: unknown version number"
+}
+
+// DictSizeTooSmallError represents an error due to the dictionary size
+// being smaller than 4 KiB.
+type DictSizeTooSmallError struct {
+	// DictSize represents the obtained dictionary size.
+	DictSize uint32
+}
+
+// Error returns a string representation of a [DictSizeTooSmallError].
+func (e *DictSizeTooSmallError) Error() string {
+	return "lzip: dictionary size is too small"
+}
+
+// DictSizeTooLargeError represents an error due to the dictionary size
+// being larger than 512 MiB.
+type DictSizeTooLargeError struct {
+	// DictSize represents the obtained dictionary size.
+	DictSize uint32
+}
+
+// Error returns a string representation of a [DictSizeTooLargeError].
+func (e *DictSizeTooLargeError) Error() string {
+	return "lzip: dictionary size is too large"
+}
+
+// InvalidCRCError represents an error due to a mismatch in the CRC of the
+// original uncompressed data.
+type InvalidCRCError struct {
+	// CRC represents the obtained CRC.
+	CRC uint32
+}
+
+// Error returns a string representation of an [InvalidCRCError].
+func (e *InvalidCRCError) Error() string {
+	return "lzip: CRC mismatch"
+}
+
+// InvalidDataSizeError represents an error due to a mismatch between the
+// size of the original uncompressed data stored in the trailer and its
+// actual size.
+type InvalidDataSizeError struct {
+	// DataSize represents the obtained data size.
+	DataSize uint64
+}
+
+// Error returns a string representation of an [InvalidDataSizeError].
+func (e *InvalidDataSizeError) Error() string {
+	return "lzip: data size mismatch"
+}
+
+// InvalidMemberSizeError represents an error due to a mismatch between the
+// total size of the member stored in the trailer and its actual total size.
+type InvalidMemberSizeError struct {
+	// MemberSize represents the obtained member size.
+	MemberSize uint64
+}
+
+// Error returns a string representation of an [InvalidMemberSizeError].
+func (e *InvalidMemberSizeError) Error() string {
+	return "lzip: member size mismatch"
+}
+
+// MemberSizeTooLargeError represents an error due to the member size being
+// larger than 2 PiB.
+type MemberSizeTooLargeError struct {
+	// MemberSize represents the obtained member size.
+	MemberSize uint64
+}
+
+// Error returns a string representation of a [MemberSizeTooLargeError].
+func (e *MemberSizeTooLargeError) Error() string {
+	return "lzip: member size is too large"
+}
diff --git a/vendor/github.com/sorairolake/lzip-go/go.sum.license b/vendor/github.com/sorairolake/lzip-go/go.sum.license
new file mode 100644
index 0000000000..df26b1a7ab
--- /dev/null
+++ b/vendor/github.com/sorairolake/lzip-go/go.sum.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2024 Shun Sakai
+
+SPDX-License-Identifier: Apache-2.0 OR MIT
diff --git a/vendor/github.com/sorairolake/lzip-go/justfile b/vendor/github.com/sorairolake/lzip-go/justfile
new file mode 100644
index 0000000000..1193103439
--- /dev/null
+++ b/vendor/github.com/sorairolake/lzip-go/justfile
@@ -0,0 +1,67 @@
+# SPDX-FileCopyrightText: 2024 Shun Sakai
+#
+# SPDX-License-Identifier: Apache-2.0 OR MIT
+
+alias all := default
+alias build-cmd := build-cmd-debug
+
+# Run default recipe
+default: test
+
+# Remove generated artifacts
+@clean:
+    go clean
+
+# Run tests
+@test:
+    go test ./...
+
+# Run `golangci-lint run`
+@golangci-lint:
+    golangci-lint run -E gofmt,goimports
+
+# Run the formatter
+fmt: gofmt goimports
+
+# Run `go fmt`
+@gofmt:
+    go fmt ./...
+
+# Run `goimports`
+@goimports:
+    fd -e go -x goimports -w
+
+# Run the linter
+lint: vet staticcheck
+
+# Run `go vet`
+@vet:
+    go vet ./...
+
+# Run `staticcheck`
+@staticcheck:
+    staticcheck ./...
+ +# Build `glzip` command in debug mode +@build-cmd-debug $CGO_ENABLED="0": + go build ./cmd/glzip + +# Build `glzip` command in release mode +@build-cmd-release $CGO_ENABLED="0": + go build -ldflags="-s -w" -trimpath ./cmd/glzip + +# Build `glzip(1)` +@build-man: + asciidoctor -b manpage docs/man/man1/glzip.1.adoc + +# Run the linter for GitHub Actions workflow files +@lint-github-actions: + actionlint -verbose + +# Run the formatter for the README +@fmt-readme: + npx prettier -w README.md + +# Increment the version +@bump part: + bump-my-version bump {{part}} diff --git a/vendor/github.com/sorairolake/lzip-go/lzip.go b/vendor/github.com/sorairolake/lzip-go/lzip.go new file mode 100644 index 0000000000..d0938eedd1 --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/lzip.go @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: 2024 Shun Sakai +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// Package lzip implements reading and writing of lzip format compressed files. +// The package supports version 1 of the specification. +// +// See the following for the specification: +// +// - https://www.nongnu.org/lzip/manual/lzip_manual.html#File-format +// - https://datatracker.ietf.org/doc/html/draft-diaz-lzip-09#section-2 +package lzip + +import ( + "math/bits" + + "github.com/ulikunitz/xz/lzma" +) + +const ( + headerSize = 6 + trailerSize = 20 +) + +const magic = "LZIP" +const magicSize = 4 + +type version byte + +const ( + version0 version = iota + version1 +) + +// MinDictSize is the minimum dictionary size, which is 4 KiB. +const MinDictSize = lzma.MinDictCap + +// MaxDictSize is the maximum dictionary size, which is 512 MiB. +const MaxDictSize = 1 << 29 + +// DefaultDictSize is the default dictionary size, which is 8 MiB. +const DefaultDictSize = 1 << 23 + +// MaxMemberSize is the maximum member size, which is 2 PiB. +const MaxMemberSize = 1 << 51 + +type header struct { + magic [magicSize]byte + version + dictSize byte +} + +func newHeader(dictSize uint32) *header { + ds := bits.Len32(dictSize - 1) + + if dictSize > MinDictSize { + base := 1 << dictSize + frac := base / 16 + + for i := 7; i >= 1; i-- { + if (base - (i * frac)) >= ds { + ds |= i << 5 + } + } + } + + z := &header{[magicSize]byte([]byte(magic)), version1, byte(ds)} + + return z +} + +type trailer struct { + crc uint32 + dataSize uint64 + memberSize uint64 +} diff --git a/vendor/github.com/sorairolake/lzip-go/reader.go b/vendor/github.com/sorairolake/lzip-go/reader.go new file mode 100644 index 0000000000..af14464ed9 --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/reader.go @@ -0,0 +1,124 @@ +// SPDX-FileCopyrightText: 2024 Shun Sakai +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +package lzip + +import ( + "bytes" + "encoding/binary" + "errors" + "hash/crc32" + "io" + "slices" + + "github.com/ulikunitz/xz/lzma" +) + +// Reader is an [io.Reader] that can be read to retrieve uncompressed data from +// a lzip-format compressed file. +type Reader struct { + r io.Reader + decompressor *lzma.Reader + trailer +} + +// NewReader creates a new [Reader] reading the given reader. 
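+// NewReader buffers the rest of the member in memory (via io.ReadAll): the
+// LZMA header it synthesizes needs the uncompressed size, which the lzip
+// format stores only in the trailer at the end of the member.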
+func NewReader(r io.Reader) (*Reader, error) { + z := new(Reader) + + var header [headerSize]byte + if _, err := r.Read(header[:]); err != nil { + return nil, err + } + + if !slices.Equal(header[:magicSize], []byte(magic)) { + return nil, ErrInvalidMagic + } + + switch v := header[4]; v { + case 0: + return nil, &UnsupportedVersionError{v} + case 1: + default: + return nil, &UnknownVersionError{v} + } + + dictSize := uint32(1 << (header[5] & 0x1f)) + dictSize -= (dictSize / 16) * uint32((header[5]>>5)&0x07) + + switch { + case dictSize < MinDictSize: + return nil, &DictSizeTooSmallError{dictSize} + case dictSize > MaxDictSize: + return nil, &DictSizeTooLargeError{dictSize} + } + + rb, err := io.ReadAll(r) + + if err != nil { + return nil, err + } + + var lzmaHeader [lzma.HeaderLen]byte + lzmaHeader[0] = lzma.Properties{LC: 3, LP: 0, PB: 2}.Code() + binary.LittleEndian.PutUint32(lzmaHeader[1:5], dictSize) + copy(lzmaHeader[5:], rb[len(rb)-16:len(rb)-8]) + + z.trailer.memberSize = uint64(headerSize + len(rb)) + if memberSize := z.trailer.memberSize; memberSize > MaxMemberSize { + return nil, &MemberSizeTooLargeError{memberSize} + } + + rb = slices.Concat(lzmaHeader[:], rb) + + r = bytes.NewReader(rb) + + z.decompressor, err = lzma.NewReader(r) + if err != nil { + return nil, err + } + + z.r = r + + return z, nil +} + +// Read reads uncompressed data from the stream. +func (z *Reader) Read(p []byte) (n int, err error) { + for n == 0 { + n, err = z.decompressor.Read(p) + if err != nil { + return n, err + } + + z.trailer.crc = crc32.Update(z.trailer.crc, crc32.IEEETable, p[:n]) + z.trailer.dataSize += uint64(n) + + if !errors.Is(err, io.EOF) { + return n, err + } + + var trailer [trailerSize]byte + if _, err := io.ReadFull(z.r, trailer[:]); err != nil { + return n, err + } + + crc := binary.LittleEndian.Uint32(trailer[:4]) + if crc != z.trailer.crc { + return n, &InvalidCRCError{crc} + } + + dataSize := binary.LittleEndian.Uint64(trailer[4:12]) + if dataSize != z.trailer.dataSize { + return n, &InvalidDataSizeError{dataSize} + } + + memberSize := binary.LittleEndian.Uint64(trailer[12:]) + if memberSize != z.trailer.memberSize { + return n, &InvalidMemberSizeError{memberSize} + } + } + + return n, nil +} diff --git a/vendor/github.com/sorairolake/lzip-go/writer.go b/vendor/github.com/sorairolake/lzip-go/writer.go new file mode 100644 index 0000000000..4f1777f36f --- /dev/null +++ b/vendor/github.com/sorairolake/lzip-go/writer.go @@ -0,0 +1,147 @@ +// SPDX-FileCopyrightText: 2024 Shun Sakai +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +package lzip + +import ( + "bytes" + "encoding/binary" + "hash/crc32" + "io" + + "github.com/ulikunitz/xz/lzma" +) + +// Writer is an [io.WriteCloser] that can be written to retrieve a lzip-format +// compressed file from data. +type Writer struct { + w io.Writer + compressor *lzma.Writer + buf bytes.Buffer + header *header + wroteHeader bool + trailer + closed bool +} + +// WriterOptions configures [Writer]. +type WriterOptions struct { + // DictSize sets the dictionary size. + DictSize uint32 +} + +func newWriterOptions() *WriterOptions { + opt := &WriterOptions{DefaultDictSize} + + return opt +} + +// Verify checks if [WriterOptions] is valid. 
+func (o *WriterOptions) Verify() error {
+	switch dictSize := o.DictSize; {
+	case dictSize < MinDictSize:
+		return &DictSizeTooSmallError{dictSize}
+	case dictSize > MaxDictSize:
+		return &DictSizeTooLargeError{dictSize}
+	}
+
+	return nil
+}
+
+// NewWriter creates a new [Writer] writing to the given writer.
+//
+// This uses the default parameters.
+func NewWriter(w io.Writer) *Writer {
+	opt := newWriterOptions()
+
+	z, err := NewWriterOptions(w, opt)
+	if err != nil {
+		panic(err)
+	}
+
+	return z
+}
+
+// NewWriterOptions creates a new [Writer] writing to the given writer.
+//
+// This uses the given [WriterOptions].
+func NewWriterOptions(w io.Writer, opt *WriterOptions) (*Writer, error) {
+	if err := opt.Verify(); err != nil {
+		return nil, err
+	}
+
+	z := &Writer{w: w}
+
+	compressor, err := lzma.WriterConfig{DictCap: int(opt.DictSize)}.NewWriter(&z.buf)
+	if err != nil {
+		return nil, err
+	}
+
+	z.compressor = compressor
+
+	header := newHeader(opt.DictSize)
+	z.header = header
+
+	return z, nil
+}
+
+// Write compresses the given uncompressed data.
+func (z *Writer) Write(p []byte) (int, error) {
+	if !z.wroteHeader {
+		z.wroteHeader = true
+
+		var header [headerSize]byte
+
+		copy(header[:magicSize], z.header.magic[:])
+		header[4] = byte(z.header.version)
+		header[5] = z.header.dictSize
+
+		if _, err := z.w.Write(header[:]); err != nil {
+			return 0, err
+		}
+	}
+
+	n, err := z.compressor.Write(p)
+	if err != nil {
+		return n, err
+	}
+
+	z.trailer.crc = crc32.Update(z.trailer.crc, crc32.IEEETable, p)
+	z.trailer.dataSize += uint64(len(p))
+
+	return n, nil
+}
+
+// Close closes the [Writer], writing the lzip trailer. It does not close
+// the underlying [io.Writer].
+func (z *Writer) Close() error {
+	if z.closed {
+		return nil
+	}
+
+	z.closed = true
+
+	if err := z.compressor.Close(); err != nil {
+		return err
+	}
+
+	cb := z.buf.Bytes()[lzma.HeaderLen:]
+	if _, err := z.w.Write(cb); err != nil {
+		return err
+	}
+
+	var trailer [trailerSize]byte
+
+	binary.LittleEndian.PutUint32(trailer[:4], z.trailer.crc)
+	binary.LittleEndian.PutUint64(trailer[4:12], z.trailer.dataSize)
+	binary.LittleEndian.PutUint64(trailer[12:], headerSize+uint64(len(cb))+trailerSize)
+
+	if memberSize := binary.LittleEndian.Uint64(trailer[12:]); memberSize > MaxMemberSize {
+		return &MemberSizeTooLargeError{memberSize}
+	}
+
+	_, err := z.w.Write(trailer[:])
+
+	return err
+}
diff --git a/vendor/github.com/xi2/xz/LICENSE b/vendor/github.com/xi2/xz/LICENSE
deleted file mode 100644
index b56f2e6a2c..0000000000
--- a/vendor/github.com/xi2/xz/LICENSE
+++ /dev/null
@@ -1,18 +0,0 @@
-Licensing of github.com/xi2/xz
-==============================
-
-    This Go package is a modified version of
-
-        XZ Embedded
-
-    The contents of the testdata directory are modified versions of
-    the test files from
-
-        XZ Utils
-
-    All the files in this package have been written by Michael Cross,
-    Lasse Collin and/or Igor Pavlov. All these files have been put
-    into the public domain. You can do whatever you want with these
-    files.
-
-    This software is provided "as is", without any warranty.
diff --git a/vendor/github.com/xi2/xz/README.md b/vendor/github.com/xi2/xz/README.md
deleted file mode 100644
index 2190af553d..0000000000
--- a/vendor/github.com/xi2/xz/README.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# Xz
-
-Package xz implements XZ decompression natively in Go.
-
-Documentation at <https://godoc.org/github.com/xi2/xz>.
-
-Download and install with `go get github.com/xi2/xz`.
-
-If you need compression as well as decompression, you might want to
-look at <https://github.com/ulikunitz/xz>.
diff --git a/vendor/go4.org/AUTHORS b/vendor/go4.org/AUTHORS
new file mode 100644
index 0000000000..d1ad485f52
--- /dev/null
+++ b/vendor/go4.org/AUTHORS
@@ -0,0 +1,8 @@
+# This is the official list of go4 authors for copyright purposes.
+# This is distinct from the CONTRIBUTORS file, which is the list of
+# people who have contributed, even if they don't own the copyright on
+# their work.
+
+Mathieu Lonjaret
+Daniel Theophanes
+Google
diff --git a/vendor/go4.org/LICENSE b/vendor/go4.org/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/vendor/go4.org/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner.
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/vendor/go4.org/readerutil/bufreaderat.go b/vendor/go4.org/readerutil/bufreaderat.go new file mode 100644 index 0000000000..0e0a95c723 --- /dev/null +++ b/vendor/go4.org/readerutil/bufreaderat.go @@ -0,0 +1,48 @@ +/* +Copyright 2018 The go4 Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package readerutil + +import "io" + +// NewBufferingReaderAt returns an io.ReaderAt that reads from r as +// necessary and keeps a copy of all data read in memory. +func NewBufferingReaderAt(r io.Reader) io.ReaderAt { + return &bufReaderAt{r: r} +} + +type bufReaderAt struct { + r io.Reader + buf []byte +} + +func (br *bufReaderAt) ReadAt(p []byte, off int64) (n int, err error) { + endOff := off + int64(len(p)) + need := endOff - int64(len(br.buf)) + if need > 0 { + buf := make([]byte, need) + var rn int + rn, err = io.ReadFull(br.r, buf) + br.buf = append(br.buf, buf[:rn]...) + } + if int64(len(br.buf)) >= off { + n = copy(p, br.buf[off:]) + } + if n == len(p) { + err = nil + } + return +} diff --git a/vendor/go4.org/readerutil/countingreader.go b/vendor/go4.org/readerutil/countingreader.go new file mode 100644 index 0000000000..bc81303904 --- /dev/null +++ b/vendor/go4.org/readerutil/countingreader.go @@ -0,0 +1,32 @@ +/* +Copyright 2011 The Go4 Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package readerutil + +import "io" + +// CountingReader wraps a Reader, incrementing N by the number of +// bytes read. No locking is performed. +type CountingReader struct { + Reader io.Reader + N *int64 +} + +func (cr CountingReader) Read(p []byte) (n int, err error) { + n, err = cr.Reader.Read(p) + *cr.N += int64(n) + return +} diff --git a/vendor/go4.org/readerutil/fakeseeker.go b/vendor/go4.org/readerutil/fakeseeker.go new file mode 100644 index 0000000000..7dca839784 --- /dev/null +++ b/vendor/go4.org/readerutil/fakeseeker.go @@ -0,0 +1,70 @@ +/* +Copyright 2014 The Perkeep Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package readerutil + +import ( + "errors" + "fmt" + "io" + "os" +) + +// fakeSeeker can seek to the ends but any read not at the current +// position will fail. +type fakeSeeker struct { + r io.Reader + size int64 + + fakePos int64 + realPos int64 +} + +// NewFakeSeeker returns a ReadSeeker that can pretend to Seek (based +// on the provided total size of the reader's content), but any reads +// will fail if the fake seek position doesn't match reality. +func NewFakeSeeker(r io.Reader, size int64) io.ReadSeeker { + return &fakeSeeker{r: r, size: size} +} + +func (fs *fakeSeeker) Seek(offset int64, whence int) (int64, error) { + var newo int64 + switch whence { + default: + return 0, errors.New("invalid whence") + case os.SEEK_SET: + newo = offset + case os.SEEK_CUR: + newo = fs.fakePos + offset + case os.SEEK_END: + newo = fs.size + offset + } + if newo < 0 { + return 0, errors.New("negative seek") + } + fs.fakePos = newo + return newo, nil +} + +func (fs *fakeSeeker) Read(p []byte) (n int, err error) { + if fs.fakePos != fs.realPos { + return 0, fmt.Errorf("attempt to read from fake seek offset %d; real offset is %d", fs.fakePos, fs.realPos) + } + n, err = fs.r.Read(p) + fs.fakePos += int64(n) + fs.realPos += int64(n) + return +} diff --git a/vendor/go4.org/readerutil/multireaderat.go b/vendor/go4.org/readerutil/multireaderat.go new file mode 100644 index 0000000000..33d148c073 --- /dev/null +++ b/vendor/go4.org/readerutil/multireaderat.go @@ -0,0 +1,91 @@ +/* +Copyright 2016 The go4 Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package readerutil + +import ( + "io" + "sort" +) + +// NewMultiReaderAt is like io.MultiReader but produces a ReaderAt +// (and Size), instead of just a reader. 
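CountingReader composes naturally with io.Copy to meter throughput; since no locking is performed, the counter must stay confined to one goroutine. A minimal sketch (the payload is illustrative):

package main

import (
	"fmt"
	"io"
	"strings"

	"go4.org/readerutil"
)

func main() {
	var n int64
	cr := readerutil.CountingReader{Reader: strings.NewReader("some payload"), N: &n}

	// Every Read through cr adds the byte count to n.
	if _, err := io.Copy(io.Discard, cr); err != nil {
		fmt.Println("copy error:", err)
		return
	}
	fmt.Println("bytes read:", n) // bytes read: 12
}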
+func NewMultiReaderAt(parts ...SizeReaderAt) SizeReaderAt { + m := &multiRA{ + parts: make([]offsetAndSource, 0, len(parts)), + } + var off int64 + for _, p := range parts { + m.parts = append(m.parts, offsetAndSource{off, p}) + off += p.Size() + } + m.size = off + return m +} + +type offsetAndSource struct { + off int64 + SizeReaderAt +} + +type multiRA struct { + parts []offsetAndSource + size int64 +} + +func (m *multiRA) Size() int64 { return m.size } + +func (m *multiRA) ReadAt(p []byte, off int64) (n int, err error) { + wantN := len(p) + + // Skip past the requested offset. + skipParts := sort.Search(len(m.parts), func(i int) bool { + // This function returns whether parts[i] will + // contribute any bytes to our output. + part := m.parts[i] + return part.off+part.Size() > off + }) + parts := m.parts[skipParts:] + + // How far to skip in the first part. + needSkip := off + if len(parts) > 0 { + needSkip -= parts[0].off + } + + for len(parts) > 0 && len(p) > 0 { + readP := p + partSize := parts[0].Size() + if int64(len(readP)) > partSize-needSkip { + readP = readP[:partSize-needSkip] + } + pn, err0 := parts[0].ReadAt(readP, needSkip) + if err0 != nil { + return n, err0 + } + n += pn + p = p[pn:] + if int64(pn)+needSkip == partSize { + parts = parts[1:] + } + needSkip = 0 + } + + if n != wantN { + err = io.ErrUnexpectedEOF + } + return +} diff --git a/vendor/go4.org/readerutil/readersize.go b/vendor/go4.org/readerutil/readersize.go new file mode 100644 index 0000000000..7e2aa18f48 --- /dev/null +++ b/vendor/go4.org/readerutil/readersize.go @@ -0,0 +1,58 @@ +/* +Copyright 2012 The Go4 Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package readerutil provides and operates on io.Readers. +package readerutil // import "go4.org/readerutil" + +import ( + "bytes" + "io" + "os" + "strings" +) + +// Size tries to determine the length of r. If r is an io.Seeker, Size may seek +// to guess the length. +func Size(r io.Reader) (size int64, ok bool) { + switch rt := r.(type) { + case *bytes.Buffer: + return int64(rt.Len()), true + case *bytes.Reader: + return int64(rt.Len()), true + case *strings.Reader: + return int64(rt.Len()), true + case io.Seeker: + pos, err := rt.Seek(0, os.SEEK_CUR) + if err != nil { + return + } + end, err := rt.Seek(0, os.SEEK_END) + if err != nil { + return + } + size = end - pos + pos1, err := rt.Seek(pos, os.SEEK_SET) + if err != nil || pos1 != pos { + msg := "failed to restore seek position" + if err != nil { + msg += ": " + err.Error() + } + panic(msg) + } + return size, true + } + return 0, false +} diff --git a/vendor/go4.org/readerutil/readerutil.go b/vendor/go4.org/readerutil/readerutil.go new file mode 100644 index 0000000000..7bdb8e1a5a --- /dev/null +++ b/vendor/go4.org/readerutil/readerutil.go @@ -0,0 +1,84 @@ +/* +Copyright 2016 The go4 Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
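NewMultiReaderAt stitches several SizeReaderAt parts into one logical ReaderAt, using sort.Search to find the first part that overlaps the requested offset. io.SectionReader already satisfies SizeReaderAt, which makes for a minimal sketch:

package main

import (
	"fmt"
	"io"
	"strings"

	"go4.org/readerutil"
)

func main() {
	// io.SectionReader provides both ReadAt and Size.
	a := io.NewSectionReader(strings.NewReader("hello "), 0, 6)
	b := io.NewSectionReader(strings.NewReader("world"), 0, 5)

	m := readerutil.NewMultiReaderAt(a, b)
	fmt.Println(m.Size()) // 11

	buf := make([]byte, 4)
	if _, err := m.ReadAt(buf, 4); err != nil {
		fmt.Println("read error:", err)
		return
	}
	fmt.Printf("%s\n", buf) // "o wo", spanning the part boundary
}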
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package readerutil contains io.Reader types. +package readerutil // import "go4.org/readerutil" + +import ( + "expvar" + "io" +) + +// A SizeReaderAt is a ReaderAt with a Size method. +// +// An io.SectionReader implements SizeReaderAt. +type SizeReaderAt interface { + Size() int64 + io.ReaderAt +} + +// A ReadSeekCloser can Read, Seek, and Close. +type ReadSeekCloser interface { + io.Reader + io.Seeker + io.Closer +} + +type ReaderAtCloser interface { + io.ReaderAt + io.Closer +} + +// TODO(wathiede): make sure all the stat readers work with code that +// type asserts ReadFrom/WriteTo. + +type varStatReader struct { + *expvar.Int + r io.Reader +} + +// NewStatsReader returns an io.Reader that will have the number of bytes +// read from r added to v. +func NewStatsReader(v *expvar.Int, r io.Reader) io.Reader { + return &varStatReader{v, r} +} + +func (v *varStatReader) Read(p []byte) (int, error) { + n, err := v.r.Read(p) + v.Int.Add(int64(n)) + return n, err +} + +type varStatReadSeeker struct { + *expvar.Int + rs io.ReadSeeker +} + +// NewStatsReadSeeker returns an io.ReadSeeker that will have the number of bytes +// read from rs added to v. +func NewStatsReadSeeker(v *expvar.Int, rs io.ReadSeeker) io.ReadSeeker { + return &varStatReadSeeker{v, rs} +} + +func (v *varStatReadSeeker) Read(p []byte) (int, error) { + n, err := v.rs.Read(p) + v.Int.Add(int64(n)) + return n, err +} + +func (v *varStatReadSeeker) Seek(offset int64, whence int) (int64, error) { + return v.rs.Seek(offset, whence) +} diff --git a/vendor/go4.org/syncutil/gate.go b/vendor/go4.org/syncutil/gate.go new file mode 100644 index 0000000000..e4592be955 --- /dev/null +++ b/vendor/go4.org/syncutil/gate.go @@ -0,0 +1,41 @@ +/* +Copyright 2013 Google Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package syncutil + +// A Gate limits concurrency. +type Gate struct { + c chan struct{} +} + +// NewGate returns a new gate that will only permit max operations at once. +func NewGate(max int) *Gate { + return &Gate{make(chan struct{}, max)} +} + +// Start starts an operation, blocking until the gate has room. +func (g *Gate) Start() { + g.c <- struct{}{} +} + +// Done finishes an operation. +func (g *Gate) Done() { + select { + case <-g.c: + default: + panic("Done called more than Start") + } +} diff --git a/vendor/go4.org/syncutil/group.go b/vendor/go4.org/syncutil/group.go new file mode 100644 index 0000000000..dacef4c484 --- /dev/null +++ b/vendor/go4.org/syncutil/group.go @@ -0,0 +1,64 @@ +/* +Copyright 2013 Google Inc. 
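Gate is the classic buffered-channel semaphore: Start blocks once max operations are in flight, and Done frees a slot (panicking if calls are unbalanced). A minimal sketch that bounds concurrency, where doWork is a hypothetical stand-in for any expensive call:

package main

import (
	"fmt"
	"sync"
	"time"

	"go4.org/syncutil"
)

// doWork is a hypothetical placeholder for an expensive operation.
func doWork(i int) {
	time.Sleep(10 * time.Millisecond)
	fmt.Println("done", i)
}

func main() {
	gate := syncutil.NewGate(3) // at most 3 calls in flight
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			gate.Start()
			defer gate.Done()
			doWork(i)
		}(i)
	}
	wg.Wait()
}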
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package syncutil + +import "sync" + +// A Group is like a sync.WaitGroup and coordinates doing +// multiple things at once. Its zero value is ready to use. +type Group struct { + wg sync.WaitGroup + mu sync.Mutex // guards errs + errs []error +} + +// Go runs fn in its own goroutine, but does not wait for it to complete. +// Call Err or Errs to wait for all the goroutines to complete. +func (g *Group) Go(fn func() error) { + g.wg.Add(1) + go func() { + defer g.wg.Done() + err := fn() + if err != nil { + g.mu.Lock() + defer g.mu.Unlock() + g.errs = append(g.errs, err) + } + }() +} + +// Wait waits for all the previous calls to Go to complete. +func (g *Group) Wait() { + g.wg.Wait() +} + +// Err waits for all previous calls to Go to complete and returns the +// first non-nil error, or nil. +func (g *Group) Err() error { + g.wg.Wait() + if len(g.errs) > 0 { + return g.errs[0] + } + return nil +} + +// Errs waits for all previous calls to Go to complete and returns +// all non-nil errors. +func (g *Group) Errs() []error { + g.wg.Wait() + return g.errs +} diff --git a/vendor/go4.org/syncutil/once.go b/vendor/go4.org/syncutil/once.go new file mode 100644 index 0000000000..cd276cec8b --- /dev/null +++ b/vendor/go4.org/syncutil/once.go @@ -0,0 +1,60 @@ +/* +Copyright 2014 The Perkeep Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package syncutil + +import ( + "sync" + "sync/atomic" +) + +// A Once will perform a successful action exactly once. +// +// Unlike a sync.Once, this Once's func returns an error +// and is re-armed on failure. +type Once struct { + m sync.Mutex + done uint32 +} + +// Do calls the function f if and only if Do has not been invoked +// without error for this instance of Once. In other words, given +// var once Once +// if once.Do(f) is called multiple times, only the first call will +// invoke f, even if f has a different value in each invocation unless +// f returns an error. A new instance of Once is required for each +// function to execute. +// +// Do is intended for initialization that must be run exactly once. Since f +// is niladic, it may be necessary to use a function literal to capture the +// arguments to a function to be invoked by Do: +// err := config.once.Do(func() error { return config.init(filename) }) +func (o *Once) Do(f func() error) error { + if atomic.LoadUint32(&o.done) == 1 { + return nil + } + // Slow-path. 
+ o.m.Lock() + defer o.m.Unlock() + var err error + if o.done == 0 { + err = f() + if err == nil { + atomic.StoreUint32(&o.done, 1) + } + } + return err +} diff --git a/vendor/go4.org/syncutil/sem.go b/vendor/go4.org/syncutil/sem.go new file mode 100644 index 0000000000..092655ff5a --- /dev/null +++ b/vendor/go4.org/syncutil/sem.go @@ -0,0 +1,64 @@ +package syncutil + +import ( + "fmt" + "log" + "sync" +) + +type debugT bool + +var debug = debugT(false) + +func (d debugT) Printf(format string, args ...interface{}) { + if bool(d) { + log.Printf(format, args...) + } +} + +// Sem implements a semaphore that can have multiple units acquired/released +// at a time. +type Sem struct { + c *sync.Cond // Protects size + max, free int64 +} + +// NewSem creates a semaphore with max units available for acquisition. +func NewSem(max int64) *Sem { + return &Sem{ + c: sync.NewCond(new(sync.Mutex)), + free: max, + max: max, + } +} + +// Acquire will deduct n units from the semaphore. If the deduction would +// result in the available units falling below zero, the call will block until +// another go routine returns units via a call to Release. If more units are +// requested than the semaphore is configured to hold, error will be non-nil. +func (s *Sem) Acquire(n int64) error { + if n > s.max { + return fmt.Errorf("sem: attempt to acquire more units than semaphore size %d > %d", n, s.max) + } + s.c.L.Lock() + defer s.c.L.Unlock() + for { + debug.Printf("Acquire check max %d free %d, n %d", s.max, s.free, n) + if s.free >= n { + s.free -= n + return nil + } + debug.Printf("Acquire Wait max %d free %d, n %d", s.max, s.free, n) + s.c.Wait() + } +} + +// Release will return n units to the semaphore and notify any currently +// blocking Acquire calls. +func (s *Sem) Release(n int64) { + s.c.L.Lock() + defer s.c.L.Unlock() + debug.Printf("Release max %d free %d, n %d", s.max, s.free, n) + s.free += n + s.c.Broadcast() +} diff --git a/vendor/go4.org/syncutil/syncutil.go b/vendor/go4.org/syncutil/syncutil.go new file mode 100644 index 0000000000..851aebd2b3 --- /dev/null +++ b/vendor/go4.org/syncutil/syncutil.go @@ -0,0 +1,18 @@ +/* +Copyright 2014 The Perkeep Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package syncutil provides various synchronization utilities. +package syncutil // import "go4.org/syncutil" diff --git a/vendor/golang.org/x/text/encoding/encoding.go b/vendor/golang.org/x/text/encoding/encoding.go new file mode 100644 index 0000000000..a0bd7cd4d0 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/encoding.go @@ -0,0 +1,335 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package encoding defines an interface for character encodings, such as Shift +// JIS and Windows 1252, that can convert to and from UTF-8. +// +// Encoding implementations are provided in other packages, such as +// golang.org/x/text/encoding/charmap and +// golang.org/x/text/encoding/japanese. 
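Group plays the same role as golang.org/x/sync/errgroup's Group, but collects every error and lets the caller take the first one (Err) or all of them (Errs). A minimal sketch:

package main

import (
	"fmt"

	"go4.org/syncutil"
)

func main() {
	var g syncutil.Group // zero value is ready to use
	for _, name := range []string{"a", "b", "c"} {
		name := name // capture a fresh copy per iteration
		g.Go(func() error {
			if name == "b" {
				return fmt.Errorf("processing %s failed", name)
			}
			return nil
		})
	}
	// Err waits for all goroutines, then returns the first non-nil error.
	if err := g.Err(); err != nil {
		fmt.Println("error:", err)
	}
}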
+package encoding // import "golang.org/x/text/encoding" + +import ( + "errors" + "io" + "strconv" + "unicode/utf8" + + "golang.org/x/text/encoding/internal/identifier" + "golang.org/x/text/transform" +) + +// TODO: +// - There seems to be some inconsistency in when decoders return errors +// and when not. Also documentation seems to suggest they shouldn't return +// errors at all (except for UTF-16). +// - Encoders seem to rely on or at least benefit from the input being in NFC +// normal form. Perhaps add an example how users could prepare their output. + +// Encoding is a character set encoding that can be transformed to and from +// UTF-8. +type Encoding interface { + // NewDecoder returns a Decoder. + NewDecoder() *Decoder + + // NewEncoder returns an Encoder. + NewEncoder() *Encoder +} + +// A Decoder converts bytes to UTF-8. It implements transform.Transformer. +// +// Transforming source bytes that are not of that encoding will not result in an +// error per se. Each byte that cannot be transcoded will be represented in the +// output by the UTF-8 encoding of '\uFFFD', the replacement rune. +type Decoder struct { + transform.Transformer + + // This forces external creators of Decoders to use names in struct + // initializers, allowing for future extendibility without having to break + // code. + _ struct{} +} + +// Bytes converts the given encoded bytes to UTF-8. It returns the converted +// bytes or nil, err if any error occurred. +func (d *Decoder) Bytes(b []byte) ([]byte, error) { + b, _, err := transform.Bytes(d, b) + if err != nil { + return nil, err + } + return b, nil +} + +// String converts the given encoded string to UTF-8. It returns the converted +// string or "", err if any error occurred. +func (d *Decoder) String(s string) (string, error) { + s, _, err := transform.String(d, s) + if err != nil { + return "", err + } + return s, nil +} + +// Reader wraps another Reader to decode its bytes. +// +// The Decoder may not be used for any other operation as long as the returned +// Reader is in use. +func (d *Decoder) Reader(r io.Reader) io.Reader { + return transform.NewReader(r, d) +} + +// An Encoder converts bytes from UTF-8. It implements transform.Transformer. +// +// Each rune that cannot be transcoded will result in an error. In this case, +// the transform will consume all source byte up to, not including the offending +// rune. Transforming source bytes that are not valid UTF-8 will be replaced by +// `\uFFFD`. To return early with an error instead, use transform.Chain to +// preprocess the data with a UTF8Validator. +type Encoder struct { + transform.Transformer + + // This forces external creators of Encoders to use names in struct + // initializers, allowing for future extendibility without having to break + // code. + _ struct{} +} + +// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if +// any error occurred. +func (e *Encoder) Bytes(b []byte) ([]byte, error) { + b, _, err := transform.Bytes(e, b) + if err != nil { + return nil, err + } + return b, nil +} + +// String converts a string from UTF-8. It returns the converted string or +// "", err if any error occurred. +func (e *Encoder) String(s string) (string, error) { + s, _, err := transform.String(e, s) + if err != nil { + return "", err + } + return s, nil +} + +// Writer wraps another Writer to encode its UTF-8 output. +// +// The Encoder may not be used for any other operation as long as the returned +// Writer is in use. 
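Decoder and Encoder are the whole public surface of this package: pick a concrete Encoding and convert through Bytes, String, Reader, or Writer. A minimal sketch, assuming the sibling golang.org/x/text/encoding/charmap package is available for a concrete legacy encoding:

package main

import (
	"fmt"

	"golang.org/x/text/encoding/charmap"
)

func main() {
	// "café" as Windows-1252 bytes: 0xE9 is é in that code page.
	raw := []byte{'c', 'a', 'f', 0xE9}

	dec := charmap.Windows1252.NewDecoder()
	u, err := dec.Bytes(raw)
	if err != nil {
		fmt.Println("decode error:", err)
		return
	}
	fmt.Println(string(u)) // café

	enc := charmap.Windows1252.NewEncoder()
	back, err := enc.Bytes(u)
	if err != nil {
		fmt.Println("encode error:", err)
		return
	}
	fmt.Printf("% x\n", back) // 63 61 66 e9
}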
+func (e *Encoder) Writer(w io.Writer) io.Writer { + return transform.NewWriter(w, e) +} + +// ASCIISub is the ASCII substitute character, as recommended by +// https://unicode.org/reports/tr36/#Text_Comparison +const ASCIISub = '\x1a' + +// Nop is the nop encoding. Its transformed bytes are the same as the source +// bytes; it does not replace invalid UTF-8 sequences. +var Nop Encoding = nop{} + +type nop struct{} + +func (nop) NewDecoder() *Decoder { + return &Decoder{Transformer: transform.Nop} +} +func (nop) NewEncoder() *Encoder { + return &Encoder{Transformer: transform.Nop} +} + +// Replacement is the replacement encoding. Decoding from the replacement +// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to +// the replacement encoding yields the same as the source bytes except that +// invalid UTF-8 is converted to '\uFFFD'. +// +// It is defined at http://encoding.spec.whatwg.org/#replacement +var Replacement Encoding = replacement{} + +type replacement struct{} + +func (replacement) NewDecoder() *Decoder { + return &Decoder{Transformer: replacementDecoder{}} +} + +func (replacement) NewEncoder() *Encoder { + return &Encoder{Transformer: replacementEncoder{}} +} + +func (replacement) ID() (mib identifier.MIB, other string) { + return identifier.Replacement, "" +} + +type replacementDecoder struct{ transform.NopResetter } + +func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + if len(dst) < 3 { + return 0, 0, transform.ErrShortDst + } + if atEOF { + const fffd = "\ufffd" + dst[0] = fffd[0] + dst[1] = fffd[1] + dst[2] = fffd[2] + nDst = 3 + } + return nDst, len(src), nil +} + +type replacementEncoder struct{ transform.NopResetter } + +func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + r, size := rune(0), 0 + + for ; nSrc < len(src); nSrc += size { + r = rune(src[nSrc]) + + // Decode a 1-byte rune. + if r < utf8.RuneSelf { + size = 1 + + } else { + // Decode a multi-byte rune. + r, size = utf8.DecodeRune(src[nSrc:]) + if size == 1 { + // All valid runes of size 1 (those below utf8.RuneSelf) were + // handled above. We have invalid UTF-8 or we haven't seen the + // full character yet. + if !atEOF && !utf8.FullRune(src[nSrc:]) { + err = transform.ErrShortSrc + break + } + r = '\ufffd' + } + } + + if nDst+utf8.RuneLen(r) > len(dst) { + err = transform.ErrShortDst + break + } + nDst += utf8.EncodeRune(dst[nDst:], r) + } + return nDst, nSrc, err +} + +// HTMLEscapeUnsupported wraps encoders to replace source runes outside the +// repertoire of the destination encoding with HTML escape sequences. +// +// This wrapper exists to comply to URL and HTML forms requiring a +// non-terminating legacy encoder. The produced sequences may lead to data +// loss as they are indistinguishable from legitimate input. To avoid this +// issue, use UTF-8 encodings whenever possible. +func HTMLEscapeUnsupported(e *Encoder) *Encoder { + return &Encoder{Transformer: &errorHandler{e, errorToHTML}} +} + +// ReplaceUnsupported wraps encoders to replace source runes outside the +// repertoire of the destination encoding with an encoding-specific +// replacement. +// +// This wrapper is only provided for backwards compatibility and legacy +// handling. Its use is strongly discouraged. Use UTF-8 whenever possible. 
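HTMLEscapeUnsupported and ReplaceUnsupported both intercept the repertoire errors a legacy encoder raises for untranscodable runes; the first emits numeric character references, the second the encoding's replacement byte. A minimal sketch of the HTML-escaping wrapper, again assuming charmap is available:

package main

import (
	"fmt"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
)

func main() {
	// Windows-1252 cannot represent π; a bare encoder would error on it.
	enc := encoding.HTMLEscapeUnsupported(charmap.Windows1252.NewEncoder())

	out, err := enc.String("pi: π")
	if err != nil {
		fmt.Println("encode error:", err)
		return
	}
	fmt.Println(out) // pi: &#960;
}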
+func ReplaceUnsupported(e *Encoder) *Encoder { + return &Encoder{Transformer: &errorHandler{e, errorToReplacement}} +} + +type errorHandler struct { + *Encoder + handler func(dst []byte, r rune, err repertoireError) (n int, ok bool) +} + +// TODO: consider making this error public in some form. +type repertoireError interface { + Replacement() byte +} + +func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF) + for err != nil { + rerr, ok := err.(repertoireError) + if !ok { + return nDst, nSrc, err + } + r, sz := utf8.DecodeRune(src[nSrc:]) + n, ok := h.handler(dst[nDst:], r, rerr) + if !ok { + return nDst, nSrc, transform.ErrShortDst + } + err = nil + nDst += n + if nSrc += sz; nSrc < len(src) { + var dn, sn int + dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF) + nDst += dn + nSrc += sn + } + } + return nDst, nSrc, err +} + +func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) { + buf := [8]byte{} + b := strconv.AppendUint(buf[:0], uint64(r), 10) + if n = len(b) + len("&#;"); n >= len(dst) { + return 0, false + } + dst[0] = '&' + dst[1] = '#' + dst[copy(dst[2:], b)+2] = ';' + return n, true +} + +func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) { + if len(dst) == 0 { + return 0, false + } + dst[0] = err.Replacement() + return 1, true +} + +// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8. +var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8") + +// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first +// input byte that is not valid UTF-8. +var UTF8Validator transform.Transformer = utf8Validator{} + +type utf8Validator struct{ transform.NopResetter } + +func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + n := len(src) + if n > len(dst) { + n = len(dst) + } + for i := 0; i < n; { + if c := src[i]; c < utf8.RuneSelf { + dst[i] = c + i++ + continue + } + _, size := utf8.DecodeRune(src[i:]) + if size == 1 { + // All valid runes of size 1 (those below utf8.RuneSelf) were + // handled above. We have invalid UTF-8 or we haven't seen the + // full character yet. + err = ErrInvalidUTF8 + if !atEOF && !utf8.FullRune(src[i:]) { + err = transform.ErrShortSrc + } + return i, i, err + } + if i+size > len(dst) { + return i, i, transform.ErrShortDst + } + for ; size > 0; size-- { + dst[i] = src[i] + i++ + } + } + if len(src) > len(dst) { + err = transform.ErrShortDst + } + return n, n, err +} diff --git a/vendor/golang.org/x/text/encoding/internal/identifier/identifier.go b/vendor/golang.org/x/text/encoding/internal/identifier/identifier.go new file mode 100644 index 0000000000..5c9b85c280 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/internal/identifier/identifier.go @@ -0,0 +1,81 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:generate go run gen.go + +// Package identifier defines the contract between implementations of Encoding +// and Index by defining identifiers that uniquely identify standardized coded +// character sets (CCS) and character encoding schemes (CES), which we will +// together refer to as encodings, for which Encoding implementations provide +// converters to and from UTF-8. This package is typically only of concern to +// implementers of Indexes and Encodings. 
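UTF8Validator is useful ahead of any encoder when silent replacement is unacceptable: chained first, it turns bad input into a hard error rather than substituted bytes. A minimal sketch using transform.Chain (charmap again assumed available):

package main

import (
	"fmt"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/transform"
)

func main() {
	// Validate first, then encode; 0xff can never start a UTF-8 rune.
	t := transform.Chain(encoding.UTF8Validator, charmap.Windows1252.NewEncoder())

	_, _, err := transform.Bytes(t, []byte{'o', 'k', 0xff})
	fmt.Println(err) // encoding: invalid UTF-8
}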
+// +// One part of the identifier is the MIB code, which is defined by IANA and +// uniquely identifies a CCS or CES. Each code is associated with data that +// references authorities, official documentation as well as aliases and MIME +// names. +// +// Not all CESs are covered by the IANA registry. The "other" string that is +// returned by ID can be used to identify other character sets or versions of +// existing ones. +// +// It is recommended that each package that provides a set of Encodings provide +// the All and Common variables to reference all supported encodings and +// commonly used subset. This allows Index implementations to include all +// available encodings without explicitly referencing or knowing about them. +package identifier + +// Note: this package is internal, but could be made public if there is a need +// for writing third-party Indexes and Encodings. + +// References: +// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt +// - http://www.iana.org/assignments/character-sets/character-sets.xhtml +// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib +// - http://www.ietf.org/rfc/rfc2978.txt +// - https://www.unicode.org/reports/tr22/ +// - http://www.w3.org/TR/encoding/ +// - https://encoding.spec.whatwg.org/ +// - https://encoding.spec.whatwg.org/encodings.json +// - https://tools.ietf.org/html/rfc6657#section-5 + +// Interface can be implemented by Encodings to define the CCS or CES for which +// it implements conversions. +type Interface interface { + // ID returns an encoding identifier. Exactly one of the mib and other + // values should be non-zero. + // + // In the usual case it is only necessary to indicate the MIB code. The + // other string can be used to specify encodings for which there is no MIB, + // such as "x-mac-dingbat". + // + // The other string may only contain the characters a-z, A-Z, 0-9, - and _. + ID() (mib MIB, other string) + + // NOTE: the restrictions on the encoding are to allow extending the syntax + // with additional information such as versions, vendors and other variants. +} + +// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds +// some identifiers for some encodings that are not covered by the IANA +// standard. +// +// See http://www.iana.org/assignments/ianacharset-mib. +type MIB uint16 + +// These additional MIB types are not defined in IANA. They are added because +// they are common and defined within the text repo. +const ( + // Unofficial marks the start of encodings not registered by IANA. + Unofficial MIB = 10000 + iota + + // Replacement is the WhatWG replacement encoding. + Replacement + + // XUserDefined is the code for x-user-defined. + XUserDefined + + // MacintoshCyrillic is the code for x-mac-cyrillic. + MacintoshCyrillic +) diff --git a/vendor/golang.org/x/text/encoding/internal/identifier/mib.go b/vendor/golang.org/x/text/encoding/internal/identifier/mib.go new file mode 100644 index 0000000000..351fb86e29 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/internal/identifier/mib.go @@ -0,0 +1,1627 @@ +// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT. + +package identifier + +const ( + // ASCII is the MIB identifier with IANA name US-ASCII (MIME: US-ASCII). + // + // ANSI X3.4-1986 + // Reference: RFC2046 + ASCII MIB = 3 + + // ISOLatin1 is the MIB identifier with IANA name ISO_8859-1:1987 (MIME: ISO-8859-1). 
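The identifier package is internal to x/text, so nothing outside the module imports it directly; its MIB data instead surfaces through public index packages. A sketch of resolving an encoding by IANA charset name, assuming golang.org/x/text/encoding/ianaindex is available:

package main

import (
	"fmt"

	"golang.org/x/text/encoding/ianaindex"
)

func main() {
	// The lookup tables behind this index are generated from the same
	// MIB identifiers enumerated below.
	e, err := ianaindex.IANA.Encoding("ISO-8859-1")
	if err != nil {
		fmt.Println("lookup error:", err)
		return
	}

	name, err := ianaindex.IANA.Name(e)
	if err != nil {
		fmt.Println("name error:", err)
		return
	}
	fmt.Println(name) // the canonical IANA name for the encoding
}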
+ // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOLatin1 MIB = 4 + + // ISOLatin2 is the MIB identifier with IANA name ISO_8859-2:1987 (MIME: ISO-8859-2). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOLatin2 MIB = 5 + + // ISOLatin3 is the MIB identifier with IANA name ISO_8859-3:1988 (MIME: ISO-8859-3). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOLatin3 MIB = 6 + + // ISOLatin4 is the MIB identifier with IANA name ISO_8859-4:1988 (MIME: ISO-8859-4). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOLatin4 MIB = 7 + + // ISOLatinCyrillic is the MIB identifier with IANA name ISO_8859-5:1988 (MIME: ISO-8859-5). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOLatinCyrillic MIB = 8 + + // ISOLatinArabic is the MIB identifier with IANA name ISO_8859-6:1987 (MIME: ISO-8859-6). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOLatinArabic MIB = 9 + + // ISOLatinGreek is the MIB identifier with IANA name ISO_8859-7:1987 (MIME: ISO-8859-7). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1947 + // Reference: RFC1345 + ISOLatinGreek MIB = 10 + + // ISOLatinHebrew is the MIB identifier with IANA name ISO_8859-8:1988 (MIME: ISO-8859-8). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOLatinHebrew MIB = 11 + + // ISOLatin5 is the MIB identifier with IANA name ISO_8859-9:1989 (MIME: ISO-8859-9). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOLatin5 MIB = 12 + + // ISOLatin6 is the MIB identifier with IANA name ISO-8859-10 (MIME: ISO-8859-10). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOLatin6 MIB = 13 + + // ISOTextComm is the MIB identifier with IANA name ISO_6937-2-add. + // + // ISO-IR: International Register of Escape Sequences and ISO 6937-2:1983 + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISOTextComm MIB = 14 + + // HalfWidthKatakana is the MIB identifier with IANA name JIS_X0201. + // + // JIS X 0201-1976. One byte only, this is equivalent to + // JIS/Roman (similar to ASCII) plus eight-bit half-width + // Katakana + // Reference: RFC1345 + HalfWidthKatakana MIB = 15 + + // JISEncoding is the MIB identifier with IANA name JIS_Encoding. + // + // JIS X 0202-1991. Uses ISO 2022 escape sequences to + // shift code sets as documented in JIS X 0202-1991. + JISEncoding MIB = 16 + + // ShiftJIS is the MIB identifier with IANA name Shift_JIS (MIME: Shift_JIS). + // + // This charset is an extension of csHalfWidthKatakana by + // adding graphic characters in JIS X 0208. 
The CCS's are + // JIS X0201:1997 and JIS X0208:1997. The + // complete definition is shown in Appendix 1 of JIS + // X0208:1997. + // This charset can be used for the top-level media type "text". + ShiftJIS MIB = 17 + + // EUCPkdFmtJapanese is the MIB identifier with IANA name Extended_UNIX_Code_Packed_Format_for_Japanese (MIME: EUC-JP). + // + // Standardized by OSF, UNIX International, and UNIX Systems + // Laboratories Pacific. Uses ISO 2022 rules to select + // code set 0: US-ASCII (a single 7-bit byte set) + // code set 1: JIS X0208-1990 (a double 8-bit byte set) + // restricted to A0-FF in both bytes + // code set 2: Half Width Katakana (a single 7-bit byte set) + // requiring SS2 as the character prefix + // code set 3: JIS X0212-1990 (a double 7-bit byte set) + // restricted to A0-FF in both bytes + // requiring SS3 as the character prefix + EUCPkdFmtJapanese MIB = 18 + + // EUCFixWidJapanese is the MIB identifier with IANA name Extended_UNIX_Code_Fixed_Width_for_Japanese. + // + // Used in Japan. Each character is 2 octets. + // code set 0: US-ASCII (a single 7-bit byte set) + // 1st byte = 00 + // 2nd byte = 20-7E + // code set 1: JIS X0208-1990 (a double 7-bit byte set) + // restricted to A0-FF in both bytes + // code set 2: Half Width Katakana (a single 7-bit byte set) + // 1st byte = 00 + // 2nd byte = A0-FF + // code set 3: JIS X0212-1990 (a double 7-bit byte set) + // restricted to A0-FF in + // the first byte + // and 21-7E in the second byte + EUCFixWidJapanese MIB = 19 + + // ISO4UnitedKingdom is the MIB identifier with IANA name BS_4730. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO4UnitedKingdom MIB = 20 + + // ISO11SwedishForNames is the MIB identifier with IANA name SEN_850200_C. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO11SwedishForNames MIB = 21 + + // ISO15Italian is the MIB identifier with IANA name IT. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO15Italian MIB = 22 + + // ISO17Spanish is the MIB identifier with IANA name ES. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO17Spanish MIB = 23 + + // ISO21German is the MIB identifier with IANA name DIN_66003. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO21German MIB = 24 + + // ISO60Norwegian1 is the MIB identifier with IANA name NS_4551-1. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO60Norwegian1 MIB = 25 + + // ISO69French is the MIB identifier with IANA name NF_Z_62-010. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO69French MIB = 26 + + // ISO10646UTF1 is the MIB identifier with IANA name ISO-10646-UTF-1. + // + // Universal Transfer Format (1), this is the multibyte + // encoding, that subsets ASCII-7. It does not have byte + // ordering issues. 
+ ISO10646UTF1 MIB = 27 + + // ISO646basic1983 is the MIB identifier with IANA name ISO_646.basic:1983. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO646basic1983 MIB = 28 + + // INVARIANT is the MIB identifier with IANA name INVARIANT. + // + // Reference: RFC1345 + INVARIANT MIB = 29 + + // ISO2IntlRefVersion is the MIB identifier with IANA name ISO_646.irv:1983. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO2IntlRefVersion MIB = 30 + + // NATSSEFI is the MIB identifier with IANA name NATS-SEFI. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + NATSSEFI MIB = 31 + + // NATSSEFIADD is the MIB identifier with IANA name NATS-SEFI-ADD. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + NATSSEFIADD MIB = 32 + + // NATSDANO is the MIB identifier with IANA name NATS-DANO. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + NATSDANO MIB = 33 + + // NATSDANOADD is the MIB identifier with IANA name NATS-DANO-ADD. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + NATSDANOADD MIB = 34 + + // ISO10Swedish is the MIB identifier with IANA name SEN_850200_B. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO10Swedish MIB = 35 + + // KSC56011987 is the MIB identifier with IANA name KS_C_5601-1987. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + KSC56011987 MIB = 36 + + // ISO2022KR is the MIB identifier with IANA name ISO-2022-KR (MIME: ISO-2022-KR). + // + // rfc1557 (see also KS_C_5601-1987) + // Reference: RFC1557 + ISO2022KR MIB = 37 + + // EUCKR is the MIB identifier with IANA name EUC-KR (MIME: EUC-KR). + // + // rfc1557 (see also KS_C_5861-1992) + // Reference: RFC1557 + EUCKR MIB = 38 + + // ISO2022JP is the MIB identifier with IANA name ISO-2022-JP (MIME: ISO-2022-JP). + // + // rfc1468 (see also rfc2237 ) + // Reference: RFC1468 + ISO2022JP MIB = 39 + + // ISO2022JP2 is the MIB identifier with IANA name ISO-2022-JP-2 (MIME: ISO-2022-JP-2). + // + // rfc1554 + // Reference: RFC1554 + ISO2022JP2 MIB = 40 + + // ISO13JISC6220jp is the MIB identifier with IANA name JIS_C6220-1969-jp. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO13JISC6220jp MIB = 41 + + // ISO14JISC6220ro is the MIB identifier with IANA name JIS_C6220-1969-ro. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO14JISC6220ro MIB = 42 + + // ISO16Portuguese is the MIB identifier with IANA name PT. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. 
+ // Reference: RFC1345 + ISO16Portuguese MIB = 43 + + // ISO18Greek7Old is the MIB identifier with IANA name greek7-old. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO18Greek7Old MIB = 44 + + // ISO19LatinGreek is the MIB identifier with IANA name latin-greek. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO19LatinGreek MIB = 45 + + // ISO25French is the MIB identifier with IANA name NF_Z_62-010_(1973). + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO25French MIB = 46 + + // ISO27LatinGreek1 is the MIB identifier with IANA name Latin-greek-1. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO27LatinGreek1 MIB = 47 + + // ISO5427Cyrillic is the MIB identifier with IANA name ISO_5427. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO5427Cyrillic MIB = 48 + + // ISO42JISC62261978 is the MIB identifier with IANA name JIS_C6226-1978. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO42JISC62261978 MIB = 49 + + // ISO47BSViewdata is the MIB identifier with IANA name BS_viewdata. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO47BSViewdata MIB = 50 + + // ISO49INIS is the MIB identifier with IANA name INIS. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO49INIS MIB = 51 + + // ISO50INIS8 is the MIB identifier with IANA name INIS-8. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO50INIS8 MIB = 52 + + // ISO51INISCyrillic is the MIB identifier with IANA name INIS-cyrillic. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO51INISCyrillic MIB = 53 + + // ISO54271981 is the MIB identifier with IANA name ISO_5427:1981. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO54271981 MIB = 54 + + // ISO5428Greek is the MIB identifier with IANA name ISO_5428:1980. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO5428Greek MIB = 55 + + // ISO57GB1988 is the MIB identifier with IANA name GB_1988-80. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO57GB1988 MIB = 56 + + // ISO58GB231280 is the MIB identifier with IANA name GB_2312-80. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. 
+ // Reference: RFC1345 + ISO58GB231280 MIB = 57 + + // ISO61Norwegian2 is the MIB identifier with IANA name NS_4551-2. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO61Norwegian2 MIB = 58 + + // ISO70VideotexSupp1 is the MIB identifier with IANA name videotex-suppl. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO70VideotexSupp1 MIB = 59 + + // ISO84Portuguese2 is the MIB identifier with IANA name PT2. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO84Portuguese2 MIB = 60 + + // ISO85Spanish2 is the MIB identifier with IANA name ES2. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO85Spanish2 MIB = 61 + + // ISO86Hungarian is the MIB identifier with IANA name MSZ_7795.3. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO86Hungarian MIB = 62 + + // ISO87JISX0208 is the MIB identifier with IANA name JIS_C6226-1983. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO87JISX0208 MIB = 63 + + // ISO88Greek7 is the MIB identifier with IANA name greek7. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO88Greek7 MIB = 64 + + // ISO89ASMO449 is the MIB identifier with IANA name ASMO_449. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO89ASMO449 MIB = 65 + + // ISO90 is the MIB identifier with IANA name iso-ir-90. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO90 MIB = 66 + + // ISO91JISC62291984a is the MIB identifier with IANA name JIS_C6229-1984-a. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO91JISC62291984a MIB = 67 + + // ISO92JISC62991984b is the MIB identifier with IANA name JIS_C6229-1984-b. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO92JISC62991984b MIB = 68 + + // ISO93JIS62291984badd is the MIB identifier with IANA name JIS_C6229-1984-b-add. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO93JIS62291984badd MIB = 69 + + // ISO94JIS62291984hand is the MIB identifier with IANA name JIS_C6229-1984-hand. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO94JIS62291984hand MIB = 70 + + // ISO95JIS62291984handadd is the MIB identifier with IANA name JIS_C6229-1984-hand-add. 
+ // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO95JIS62291984handadd MIB = 71 + + // ISO96JISC62291984kana is the MIB identifier with IANA name JIS_C6229-1984-kana. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO96JISC62291984kana MIB = 72 + + // ISO2033 is the MIB identifier with IANA name ISO_2033-1983. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO2033 MIB = 73 + + // ISO99NAPLPS is the MIB identifier with IANA name ANSI_X3.110-1983. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO99NAPLPS MIB = 74 + + // ISO102T617bit is the MIB identifier with IANA name T.61-7bit. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO102T617bit MIB = 75 + + // ISO103T618bit is the MIB identifier with IANA name T.61-8bit. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO103T618bit MIB = 76 + + // ISO111ECMACyrillic is the MIB identifier with IANA name ECMA-cyrillic. + // + // ISO registry + ISO111ECMACyrillic MIB = 77 + + // ISO121Canadian1 is the MIB identifier with IANA name CSA_Z243.4-1985-1. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO121Canadian1 MIB = 78 + + // ISO122Canadian2 is the MIB identifier with IANA name CSA_Z243.4-1985-2. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO122Canadian2 MIB = 79 + + // ISO123CSAZ24341985gr is the MIB identifier with IANA name CSA_Z243.4-1985-gr. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO123CSAZ24341985gr MIB = 80 + + // ISO88596E is the MIB identifier with IANA name ISO_8859-6-E (MIME: ISO-8859-6-E). + // + // rfc1556 + // Reference: RFC1556 + ISO88596E MIB = 81 + + // ISO88596I is the MIB identifier with IANA name ISO_8859-6-I (MIME: ISO-8859-6-I). + // + // rfc1556 + // Reference: RFC1556 + ISO88596I MIB = 82 + + // ISO128T101G2 is the MIB identifier with IANA name T.101-G2. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO128T101G2 MIB = 83 + + // ISO88598E is the MIB identifier with IANA name ISO_8859-8-E (MIME: ISO-8859-8-E). + // + // rfc1556 + // Reference: RFC1556 + ISO88598E MIB = 84 + + // ISO88598I is the MIB identifier with IANA name ISO_8859-8-I (MIME: ISO-8859-8-I). + // + // rfc1556 + // Reference: RFC1556 + ISO88598I MIB = 85 + + // ISO139CSN369103 is the MIB identifier with IANA name CSN_369103. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO139CSN369103 MIB = 86 + + // ISO141JUSIB1002 is the MIB identifier with IANA name JUS_I.B1.002. 
+ // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO141JUSIB1002 MIB = 87 + + // ISO143IECP271 is the MIB identifier with IANA name IEC_P27-1. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO143IECP271 MIB = 88 + + // ISO146Serbian is the MIB identifier with IANA name JUS_I.B1.003-serb. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO146Serbian MIB = 89 + + // ISO147Macedonian is the MIB identifier with IANA name JUS_I.B1.003-mac. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO147Macedonian MIB = 90 + + // ISO150GreekCCITT is the MIB identifier with IANA name greek-ccitt. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO150GreekCCITT MIB = 91 + + // ISO151Cuba is the MIB identifier with IANA name NC_NC00-10:81. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO151Cuba MIB = 92 + + // ISO6937Add is the MIB identifier with IANA name ISO_6937-2-25. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO6937Add MIB = 93 + + // ISO153GOST1976874 is the MIB identifier with IANA name GOST_19768-74. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO153GOST1976874 MIB = 94 + + // ISO8859Supp is the MIB identifier with IANA name ISO_8859-supp. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO8859Supp MIB = 95 + + // ISO10367Box is the MIB identifier with IANA name ISO_10367-box. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO10367Box MIB = 96 + + // ISO158Lap is the MIB identifier with IANA name latin-lap. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO158Lap MIB = 97 + + // ISO159JISX02121990 is the MIB identifier with IANA name JIS_X0212-1990. + // + // ISO-IR: International Register of Escape Sequences + // Note: The current registration authority is IPSJ/ITSCJ, Japan. + // Reference: RFC1345 + ISO159JISX02121990 MIB = 98 + + // ISO646Danish is the MIB identifier with IANA name DS_2089. + // + // Danish Standard, DS 2089, February 1974 + // Reference: RFC1345 + ISO646Danish MIB = 99 + + // USDK is the MIB identifier with IANA name us-dk. + // + // Reference: RFC1345 + USDK MIB = 100 + + // DKUS is the MIB identifier with IANA name dk-us. + // + // Reference: RFC1345 + DKUS MIB = 101 + + // KSC5636 is the MIB identifier with IANA name KSC5636. + // + // Reference: RFC1345 + KSC5636 MIB = 102 + + // Unicode11UTF7 is the MIB identifier with IANA name UNICODE-1-1-UTF-7. 
+ // + // rfc1642 + // Reference: RFC1642 + Unicode11UTF7 MIB = 103 + + // ISO2022CN is the MIB identifier with IANA name ISO-2022-CN. + // + // rfc1922 + // Reference: RFC1922 + ISO2022CN MIB = 104 + + // ISO2022CNEXT is the MIB identifier with IANA name ISO-2022-CN-EXT. + // + // rfc1922 + // Reference: RFC1922 + ISO2022CNEXT MIB = 105 + + // UTF8 is the MIB identifier with IANA name UTF-8. + // + // rfc3629 + // Reference: RFC3629 + UTF8 MIB = 106 + + // ISO885913 is the MIB identifier with IANA name ISO-8859-13. + // + // ISO See https://www.iana.org/assignments/charset-reg/ISO-8859-13 https://www.iana.org/assignments/charset-reg/ISO-8859-13 + ISO885913 MIB = 109 + + // ISO885914 is the MIB identifier with IANA name ISO-8859-14. + // + // ISO See https://www.iana.org/assignments/charset-reg/ISO-8859-14 + ISO885914 MIB = 110 + + // ISO885915 is the MIB identifier with IANA name ISO-8859-15. + // + // ISO + // Please see: https://www.iana.org/assignments/charset-reg/ISO-8859-15 + ISO885915 MIB = 111 + + // ISO885916 is the MIB identifier with IANA name ISO-8859-16. + // + // ISO + ISO885916 MIB = 112 + + // GBK is the MIB identifier with IANA name GBK. + // + // Chinese IT Standardization Technical Committee + // Please see: https://www.iana.org/assignments/charset-reg/GBK + GBK MIB = 113 + + // GB18030 is the MIB identifier with IANA name GB18030. + // + // Chinese IT Standardization Technical Committee + // Please see: https://www.iana.org/assignments/charset-reg/GB18030 + GB18030 MIB = 114 + + // OSDEBCDICDF0415 is the MIB identifier with IANA name OSD_EBCDIC_DF04_15. + // + // Fujitsu-Siemens standard mainframe EBCDIC encoding + // Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-15 + OSDEBCDICDF0415 MIB = 115 + + // OSDEBCDICDF03IRV is the MIB identifier with IANA name OSD_EBCDIC_DF03_IRV. + // + // Fujitsu-Siemens standard mainframe EBCDIC encoding + // Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF03-IRV + OSDEBCDICDF03IRV MIB = 116 + + // OSDEBCDICDF041 is the MIB identifier with IANA name OSD_EBCDIC_DF04_1. + // + // Fujitsu-Siemens standard mainframe EBCDIC encoding + // Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-1 + OSDEBCDICDF041 MIB = 117 + + // ISO115481 is the MIB identifier with IANA name ISO-11548-1. + // + // See https://www.iana.org/assignments/charset-reg/ISO-11548-1 + ISO115481 MIB = 118 + + // KZ1048 is the MIB identifier with IANA name KZ-1048. + // + // See https://www.iana.org/assignments/charset-reg/KZ-1048 + KZ1048 MIB = 119 + + // Unicode is the MIB identifier with IANA name ISO-10646-UCS-2. + // + // the 2-octet Basic Multilingual Plane, aka Unicode + // this needs to specify network byte order: the standard + // does not specify (it is a 16-bit integer space) + Unicode MIB = 1000 + + // UCS4 is the MIB identifier with IANA name ISO-10646-UCS-4. + // + // the full code space. (same comment about byte order, + // these are 31-bit numbers. + UCS4 MIB = 1001 + + // UnicodeASCII is the MIB identifier with IANA name ISO-10646-UCS-Basic. + // + // ASCII subset of Unicode. Basic Latin = collection 1 + // See ISO 10646, Appendix A + UnicodeASCII MIB = 1002 + + // UnicodeLatin1 is the MIB identifier with IANA name ISO-10646-Unicode-Latin1. + // + // ISO Latin-1 subset of Unicode. Basic Latin and Latin-1 + // Supplement = collections 1 and 2. See ISO 10646, + // Appendix A. See rfc1815 . + UnicodeLatin1 MIB = 1003 + + // UnicodeJapanese is the MIB identifier with IANA name ISO-10646-J-1. 
+ // + // ISO 10646 Japanese, see rfc1815 . + UnicodeJapanese MIB = 1004 + + // UnicodeIBM1261 is the MIB identifier with IANA name ISO-Unicode-IBM-1261. + // + // IBM Latin-2, -3, -5, Extended Presentation Set, GCSGID: 1261 + UnicodeIBM1261 MIB = 1005 + + // UnicodeIBM1268 is the MIB identifier with IANA name ISO-Unicode-IBM-1268. + // + // IBM Latin-4 Extended Presentation Set, GCSGID: 1268 + UnicodeIBM1268 MIB = 1006 + + // UnicodeIBM1276 is the MIB identifier with IANA name ISO-Unicode-IBM-1276. + // + // IBM Cyrillic Greek Extended Presentation Set, GCSGID: 1276 + UnicodeIBM1276 MIB = 1007 + + // UnicodeIBM1264 is the MIB identifier with IANA name ISO-Unicode-IBM-1264. + // + // IBM Arabic Presentation Set, GCSGID: 1264 + UnicodeIBM1264 MIB = 1008 + + // UnicodeIBM1265 is the MIB identifier with IANA name ISO-Unicode-IBM-1265. + // + // IBM Hebrew Presentation Set, GCSGID: 1265 + UnicodeIBM1265 MIB = 1009 + + // Unicode11 is the MIB identifier with IANA name UNICODE-1-1. + // + // rfc1641 + // Reference: RFC1641 + Unicode11 MIB = 1010 + + // SCSU is the MIB identifier with IANA name SCSU. + // + // SCSU See https://www.iana.org/assignments/charset-reg/SCSU + SCSU MIB = 1011 + + // UTF7 is the MIB identifier with IANA name UTF-7. + // + // rfc2152 + // Reference: RFC2152 + UTF7 MIB = 1012 + + // UTF16BE is the MIB identifier with IANA name UTF-16BE. + // + // rfc2781 + // Reference: RFC2781 + UTF16BE MIB = 1013 + + // UTF16LE is the MIB identifier with IANA name UTF-16LE. + // + // rfc2781 + // Reference: RFC2781 + UTF16LE MIB = 1014 + + // UTF16 is the MIB identifier with IANA name UTF-16. + // + // rfc2781 + // Reference: RFC2781 + UTF16 MIB = 1015 + + // CESU8 is the MIB identifier with IANA name CESU-8. + // + // https://www.unicode.org/reports/tr26 + CESU8 MIB = 1016 + + // UTF32 is the MIB identifier with IANA name UTF-32. + // + // https://www.unicode.org/reports/tr19/ + UTF32 MIB = 1017 + + // UTF32BE is the MIB identifier with IANA name UTF-32BE. + // + // https://www.unicode.org/reports/tr19/ + UTF32BE MIB = 1018 + + // UTF32LE is the MIB identifier with IANA name UTF-32LE. + // + // https://www.unicode.org/reports/tr19/ + UTF32LE MIB = 1019 + + // BOCU1 is the MIB identifier with IANA name BOCU-1. + // + // https://www.unicode.org/notes/tn6/ + BOCU1 MIB = 1020 + + // UTF7IMAP is the MIB identifier with IANA name UTF-7-IMAP. + // + // Note: This charset is used to encode Unicode in IMAP mailbox names; + // see section 5.1.3 of rfc3501 . It should never be used + // outside this context. A name has been assigned so that charset processing + // implementations can refer to it in a consistent way. + UTF7IMAP MIB = 1021 + + // Windows30Latin1 is the MIB identifier with IANA name ISO-8859-1-Windows-3.0-Latin-1. + // + // Extended ISO 8859-1 Latin-1 for Windows 3.0. + // PCL Symbol Set id: 9U + Windows30Latin1 MIB = 2000 + + // Windows31Latin1 is the MIB identifier with IANA name ISO-8859-1-Windows-3.1-Latin-1. + // + // Extended ISO 8859-1 Latin-1 for Windows 3.1. + // PCL Symbol Set id: 19U + Windows31Latin1 MIB = 2001 + + // Windows31Latin2 is the MIB identifier with IANA name ISO-8859-2-Windows-Latin-2. + // + // Extended ISO 8859-2. Latin-2 for Windows 3.1. + // PCL Symbol Set id: 9E + Windows31Latin2 MIB = 2002 + + // Windows31Latin5 is the MIB identifier with IANA name ISO-8859-9-Windows-Latin-5. + // + // Extended ISO 8859-9. Latin-5 for Windows 3.1 + // PCL Symbol Set id: 5T + Windows31Latin5 MIB = 2003 + + // HPRoman8 is the MIB identifier with IANA name hp-roman8. 
+	//
+	// LaserJet IIP Printer User's Manual,
+	// HP part no 33471-90901, Hewlett-Packard, June 1989.
+	// Reference: RFC1345
+	HPRoman8 MIB = 2004
+
+	// AdobeStandardEncoding is the MIB identifier with IANA name Adobe-Standard-Encoding.
+	//
+	// PostScript Language Reference Manual
+	// PCL Symbol Set id: 10J
+	AdobeStandardEncoding MIB = 2005
+
+	// VenturaUS is the MIB identifier with IANA name Ventura-US.
+	//
+	// Ventura US. ASCII plus characters typically used in
+	// publishing, like pilcrow, copyright, registered, trade mark,
+	// section, dagger, and double dagger in the range A0 (hex)
+	// to FF (hex).
+	// PCL Symbol Set id: 14J
+	VenturaUS MIB = 2006
+
+	// VenturaInternational is the MIB identifier with IANA name Ventura-International.
+	//
+	// Ventura International. ASCII plus coded characters similar
+	// to Roman8.
+	// PCL Symbol Set id: 13J
+	VenturaInternational MIB = 2007
+
+	// DECMCS is the MIB identifier with IANA name DEC-MCS.
+	//
+	// VAX/VMS User's Manual,
+	// Order Number: AI-Y517A-TE, April 1986.
+	// Reference: RFC1345
+	DECMCS MIB = 2008
+
+	// PC850Multilingual is the MIB identifier with IANA name IBM850.
+	//
+	// IBM NLS RM Vol2 SE09-8002-01, March 1990
+	// Reference: RFC1345
+	PC850Multilingual MIB = 2009
+
+	// PC8DanishNorwegian is the MIB identifier with IANA name PC8-Danish-Norwegian.
+	//
+	// PC Danish Norwegian
+	// 8-bit PC set for Danish Norwegian
+	// PCL Symbol Set id: 11U
+	PC8DanishNorwegian MIB = 2012
+
+	// PC862LatinHebrew is the MIB identifier with IANA name IBM862.
+	//
+	// IBM NLS RM Vol2 SE09-8002-01, March 1990
+	// Reference: RFC1345
+	PC862LatinHebrew MIB = 2013
+
+	// PC8Turkish is the MIB identifier with IANA name PC8-Turkish.
+	//
+	// PC Latin Turkish. PCL Symbol Set id: 9T
+	PC8Turkish MIB = 2014
+
+	// IBMSymbols is the MIB identifier with IANA name IBM-Symbols.
+	//
+	// Presentation Set, CPGID: 259
+	IBMSymbols MIB = 2015
+
+	// IBMThai is the MIB identifier with IANA name IBM-Thai.
+	//
+	// Presentation Set, CPGID: 838
+	IBMThai MIB = 2016
+
+	// HPLegal is the MIB identifier with IANA name HP-Legal.
+	//
+	// PCL 5 Comparison Guide, Hewlett-Packard,
+	// HP part number 5961-0510, October 1992
+	// PCL Symbol Set id: 1U
+	HPLegal MIB = 2017
+
+	// HPPiFont is the MIB identifier with IANA name HP-Pi-font.
+	//
+	// PCL 5 Comparison Guide, Hewlett-Packard,
+	// HP part number 5961-0510, October 1992
+	// PCL Symbol Set id: 15U
+	HPPiFont MIB = 2018
+
+	// HPMath8 is the MIB identifier with IANA name HP-Math8.
+	//
+	// PCL 5 Comparison Guide, Hewlett-Packard,
+	// HP part number 5961-0510, October 1992
+	// PCL Symbol Set id: 8M
+	HPMath8 MIB = 2019
+
+	// HPPSMath is the MIB identifier with IANA name Adobe-Symbol-Encoding.
+	//
+	// PostScript Language Reference Manual
+	// PCL Symbol Set id: 5M
+	HPPSMath MIB = 2020
+
+	// HPDesktop is the MIB identifier with IANA name HP-DeskTop.
+	//
+	// PCL 5 Comparison Guide, Hewlett-Packard,
+	// HP part number 5961-0510, October 1992
+	// PCL Symbol Set id: 7J
+	HPDesktop MIB = 2021
+
+	// VenturaMath is the MIB identifier with IANA name Ventura-Math.
+	//
+	// PCL 5 Comparison Guide, Hewlett-Packard,
+	// HP part number 5961-0510, October 1992
+	// PCL Symbol Set id: 6M
+	VenturaMath MIB = 2022
+
+	// MicrosoftPublishing is the MIB identifier with IANA name Microsoft-Publishing.
+ // + // PCL 5 Comparison Guide, Hewlett-Packard, + // HP part number 5961-0510, October 1992 + // PCL Symbol Set id: 6J + MicrosoftPublishing MIB = 2023 + + // Windows31J is the MIB identifier with IANA name Windows-31J. + // + // Windows Japanese. A further extension of Shift_JIS + // to include NEC special characters (Row 13), NEC + // selection of IBM extensions (Rows 89 to 92), and IBM + // extensions (Rows 115 to 119). The CCS's are + // JIS X0201:1997, JIS X0208:1997, and these extensions. + // This charset can be used for the top-level media type "text", + // but it is of limited or specialized use (see rfc2278 ). + // PCL Symbol Set id: 19K + Windows31J MIB = 2024 + + // GB2312 is the MIB identifier with IANA name GB2312 (MIME: GB2312). + // + // Chinese for People's Republic of China (PRC) mixed one byte, + // two byte set: + // 20-7E = one byte ASCII + // A1-FE = two byte PRC Kanji + // See GB 2312-80 + // PCL Symbol Set Id: 18C + GB2312 MIB = 2025 + + // Big5 is the MIB identifier with IANA name Big5 (MIME: Big5). + // + // Chinese for Taiwan Multi-byte set. + // PCL Symbol Set Id: 18T + Big5 MIB = 2026 + + // Macintosh is the MIB identifier with IANA name macintosh. + // + // The Unicode Standard ver1.0, ISBN 0-201-56788-1, Oct 1991 + // Reference: RFC1345 + Macintosh MIB = 2027 + + // IBM037 is the MIB identifier with IANA name IBM037. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM037 MIB = 2028 + + // IBM038 is the MIB identifier with IANA name IBM038. + // + // IBM 3174 Character Set Ref, GA27-3831-02, March 1990 + // Reference: RFC1345 + IBM038 MIB = 2029 + + // IBM273 is the MIB identifier with IANA name IBM273. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM273 MIB = 2030 + + // IBM274 is the MIB identifier with IANA name IBM274. + // + // IBM 3174 Character Set Ref, GA27-3831-02, March 1990 + // Reference: RFC1345 + IBM274 MIB = 2031 + + // IBM275 is the MIB identifier with IANA name IBM275. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM275 MIB = 2032 + + // IBM277 is the MIB identifier with IANA name IBM277. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM277 MIB = 2033 + + // IBM278 is the MIB identifier with IANA name IBM278. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM278 MIB = 2034 + + // IBM280 is the MIB identifier with IANA name IBM280. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM280 MIB = 2035 + + // IBM281 is the MIB identifier with IANA name IBM281. + // + // IBM 3174 Character Set Ref, GA27-3831-02, March 1990 + // Reference: RFC1345 + IBM281 MIB = 2036 + + // IBM284 is the MIB identifier with IANA name IBM284. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM284 MIB = 2037 + + // IBM285 is the MIB identifier with IANA name IBM285. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM285 MIB = 2038 + + // IBM290 is the MIB identifier with IANA name IBM290. + // + // IBM 3174 Character Set Ref, GA27-3831-02, March 1990 + // Reference: RFC1345 + IBM290 MIB = 2039 + + // IBM297 is the MIB identifier with IANA name IBM297. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM297 MIB = 2040 + + // IBM420 is the MIB identifier with IANA name IBM420. 
+ // + // IBM NLS RM Vol2 SE09-8002-01, March 1990, + // IBM NLS RM p 11-11 + // Reference: RFC1345 + IBM420 MIB = 2041 + + // IBM423 is the MIB identifier with IANA name IBM423. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM423 MIB = 2042 + + // IBM424 is the MIB identifier with IANA name IBM424. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM424 MIB = 2043 + + // PC8CodePage437 is the MIB identifier with IANA name IBM437. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + PC8CodePage437 MIB = 2011 + + // IBM500 is the MIB identifier with IANA name IBM500. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM500 MIB = 2044 + + // IBM851 is the MIB identifier with IANA name IBM851. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM851 MIB = 2045 + + // PCp852 is the MIB identifier with IANA name IBM852. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + PCp852 MIB = 2010 + + // IBM855 is the MIB identifier with IANA name IBM855. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM855 MIB = 2046 + + // IBM857 is the MIB identifier with IANA name IBM857. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM857 MIB = 2047 + + // IBM860 is the MIB identifier with IANA name IBM860. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM860 MIB = 2048 + + // IBM861 is the MIB identifier with IANA name IBM861. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM861 MIB = 2049 + + // IBM863 is the MIB identifier with IANA name IBM863. + // + // IBM Keyboard layouts and code pages, PN 07G4586 June 1991 + // Reference: RFC1345 + IBM863 MIB = 2050 + + // IBM864 is the MIB identifier with IANA name IBM864. + // + // IBM Keyboard layouts and code pages, PN 07G4586 June 1991 + // Reference: RFC1345 + IBM864 MIB = 2051 + + // IBM865 is the MIB identifier with IANA name IBM865. + // + // IBM DOS 3.3 Ref (Abridged), 94X9575 (Feb 1987) + // Reference: RFC1345 + IBM865 MIB = 2052 + + // IBM868 is the MIB identifier with IANA name IBM868. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM868 MIB = 2053 + + // IBM869 is the MIB identifier with IANA name IBM869. + // + // IBM Keyboard layouts and code pages, PN 07G4586 June 1991 + // Reference: RFC1345 + IBM869 MIB = 2054 + + // IBM870 is the MIB identifier with IANA name IBM870. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM870 MIB = 2055 + + // IBM871 is the MIB identifier with IANA name IBM871. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM871 MIB = 2056 + + // IBM880 is the MIB identifier with IANA name IBM880. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM880 MIB = 2057 + + // IBM891 is the MIB identifier with IANA name IBM891. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM891 MIB = 2058 + + // IBM903 is the MIB identifier with IANA name IBM903. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM903 MIB = 2059 + + // IBBM904 is the MIB identifier with IANA name IBM904. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBBM904 MIB = 2060 + + // IBM905 is the MIB identifier with IANA name IBM905. 
+ // + // IBM 3174 Character Set Ref, GA27-3831-02, March 1990 + // Reference: RFC1345 + IBM905 MIB = 2061 + + // IBM918 is the MIB identifier with IANA name IBM918. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM918 MIB = 2062 + + // IBM1026 is the MIB identifier with IANA name IBM1026. + // + // IBM NLS RM Vol2 SE09-8002-01, March 1990 + // Reference: RFC1345 + IBM1026 MIB = 2063 + + // IBMEBCDICATDE is the MIB identifier with IANA name EBCDIC-AT-DE. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + IBMEBCDICATDE MIB = 2064 + + // EBCDICATDEA is the MIB identifier with IANA name EBCDIC-AT-DE-A. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICATDEA MIB = 2065 + + // EBCDICCAFR is the MIB identifier with IANA name EBCDIC-CA-FR. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICCAFR MIB = 2066 + + // EBCDICDKNO is the MIB identifier with IANA name EBCDIC-DK-NO. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICDKNO MIB = 2067 + + // EBCDICDKNOA is the MIB identifier with IANA name EBCDIC-DK-NO-A. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICDKNOA MIB = 2068 + + // EBCDICFISE is the MIB identifier with IANA name EBCDIC-FI-SE. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICFISE MIB = 2069 + + // EBCDICFISEA is the MIB identifier with IANA name EBCDIC-FI-SE-A. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICFISEA MIB = 2070 + + // EBCDICFR is the MIB identifier with IANA name EBCDIC-FR. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICFR MIB = 2071 + + // EBCDICIT is the MIB identifier with IANA name EBCDIC-IT. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICIT MIB = 2072 + + // EBCDICPT is the MIB identifier with IANA name EBCDIC-PT. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICPT MIB = 2073 + + // EBCDICES is the MIB identifier with IANA name EBCDIC-ES. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICES MIB = 2074 + + // EBCDICESA is the MIB identifier with IANA name EBCDIC-ES-A. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICESA MIB = 2075 + + // EBCDICESS is the MIB identifier with IANA name EBCDIC-ES-S. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICESS MIB = 2076 + + // EBCDICUK is the MIB identifier with IANA name EBCDIC-UK. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICUK MIB = 2077 + + // EBCDICUS is the MIB identifier with IANA name EBCDIC-US. + // + // IBM 3270 Char Set Ref Ch 10, GA27-2837-9, April 1987 + // Reference: RFC1345 + EBCDICUS MIB = 2078 + + // Unknown8BiT is the MIB identifier with IANA name UNKNOWN-8BIT. + // + // Reference: RFC1428 + Unknown8BiT MIB = 2079 + + // Mnemonic is the MIB identifier with IANA name MNEMONIC. + // + // rfc1345 , also known as "mnemonic+ascii+38" + // Reference: RFC1345 + Mnemonic MIB = 2080 + + // Mnem is the MIB identifier with IANA name MNEM. 
+	//
+	// rfc1345 , also known as "mnemonic+ascii+8200"
+	// Reference: RFC1345
+	Mnem MIB = 2081
+
+	// VISCII is the MIB identifier with IANA name VISCII.
+	//
+	// rfc1456
+	// Reference: RFC1456
+	VISCII MIB = 2082
+
+	// VIQR is the MIB identifier with IANA name VIQR.
+	//
+	// rfc1456
+	// Reference: RFC1456
+	VIQR MIB = 2083
+
+	// KOI8R is the MIB identifier with IANA name KOI8-R (MIME: KOI8-R).
+	//
+	// rfc1489 , based on GOST-19768-74, ISO-6937/8,
+	// INIS-Cyrillic, ISO-5427.
+	// Reference: RFC1489
+	KOI8R MIB = 2084
+
+	// HZGB2312 is the MIB identifier with IANA name HZ-GB-2312.
+	//
+	// rfc1842, rfc1843
+	// Reference: RFC1842, RFC1843
+	HZGB2312 MIB = 2085
+
+	// IBM866 is the MIB identifier with IANA name IBM866.
+	//
+	// IBM NLDG Volume 2 (SE09-8002-03) August 1994
+	IBM866 MIB = 2086
+
+	// PC775Baltic is the MIB identifier with IANA name IBM775.
+	//
+	// HP PCL 5 Comparison Guide (P/N 5021-0329) pp B-13, 1996
+	PC775Baltic MIB = 2087
+
+	// KOI8U is the MIB identifier with IANA name KOI8-U.
+	//
+	// rfc2319
+	// Reference: RFC2319
+	KOI8U MIB = 2088
+
+	// IBM00858 is the MIB identifier with IANA name IBM00858.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM00858
+	IBM00858 MIB = 2089
+
+	// IBM00924 is the MIB identifier with IANA name IBM00924.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM00924
+	IBM00924 MIB = 2090
+
+	// IBM01140 is the MIB identifier with IANA name IBM01140.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01140
+	IBM01140 MIB = 2091
+
+	// IBM01141 is the MIB identifier with IANA name IBM01141.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01141
+	IBM01141 MIB = 2092
+
+	// IBM01142 is the MIB identifier with IANA name IBM01142.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01142
+	IBM01142 MIB = 2093
+
+	// IBM01143 is the MIB identifier with IANA name IBM01143.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01143
+	IBM01143 MIB = 2094
+
+	// IBM01144 is the MIB identifier with IANA name IBM01144.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01144
+	IBM01144 MIB = 2095
+
+	// IBM01145 is the MIB identifier with IANA name IBM01145.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01145
+	IBM01145 MIB = 2096
+
+	// IBM01146 is the MIB identifier with IANA name IBM01146.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01146
+	IBM01146 MIB = 2097
+
+	// IBM01147 is the MIB identifier with IANA name IBM01147.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01147
+	IBM01147 MIB = 2098
+
+	// IBM01148 is the MIB identifier with IANA name IBM01148.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01148
+	IBM01148 MIB = 2099
+
+	// IBM01149 is the MIB identifier with IANA name IBM01149.
+	//
+	// IBM See https://www.iana.org/assignments/charset-reg/IBM01149
+	IBM01149 MIB = 2100
+
+	// Big5HKSCS is the MIB identifier with IANA name Big5-HKSCS.
+	//
+	// See https://www.iana.org/assignments/charset-reg/Big5-HKSCS
+	Big5HKSCS MIB = 2101
+
+	// IBM1047 is the MIB identifier with IANA name IBM1047.
+	//
+	// IBM1047 (EBCDIC Latin 1/Open Systems) https://www-1.ibm.com/servers/eserver/iseries/software/globalization/pdf/cp01047z.pdf
+	IBM1047 MIB = 2102
+
+	// PTCP154 is the MIB identifier with IANA name PTCP154.
+	//
+	// See https://www.iana.org/assignments/charset-reg/PTCP154
+	PTCP154 MIB = 2103
+
+	// Amiga1251 is the MIB identifier with IANA name Amiga-1251.
+ // + // See https://www.amiga.ultranet.ru/Amiga-1251.html + Amiga1251 MIB = 2104 + + // KOI7switched is the MIB identifier with IANA name KOI7-switched. + // + // See https://www.iana.org/assignments/charset-reg/KOI7-switched + KOI7switched MIB = 2105 + + // BRF is the MIB identifier with IANA name BRF. + // + // See https://www.iana.org/assignments/charset-reg/BRF + BRF MIB = 2106 + + // TSCII is the MIB identifier with IANA name TSCII. + // + // See https://www.iana.org/assignments/charset-reg/TSCII + TSCII MIB = 2107 + + // CP51932 is the MIB identifier with IANA name CP51932. + // + // See https://www.iana.org/assignments/charset-reg/CP51932 + CP51932 MIB = 2108 + + // Windows874 is the MIB identifier with IANA name windows-874. + // + // See https://www.iana.org/assignments/charset-reg/windows-874 + Windows874 MIB = 2109 + + // Windows1250 is the MIB identifier with IANA name windows-1250. + // + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1250 + Windows1250 MIB = 2250 + + // Windows1251 is the MIB identifier with IANA name windows-1251. + // + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1251 + Windows1251 MIB = 2251 + + // Windows1252 is the MIB identifier with IANA name windows-1252. + // + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1252 + Windows1252 MIB = 2252 + + // Windows1253 is the MIB identifier with IANA name windows-1253. + // + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1253 + Windows1253 MIB = 2253 + + // Windows1254 is the MIB identifier with IANA name windows-1254. + // + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1254 + Windows1254 MIB = 2254 + + // Windows1255 is the MIB identifier with IANA name windows-1255. + // + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1255 + Windows1255 MIB = 2255 + + // Windows1256 is the MIB identifier with IANA name windows-1256. + // + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1256 + Windows1256 MIB = 2256 + + // Windows1257 is the MIB identifier with IANA name windows-1257. + // + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1257 + Windows1257 MIB = 2257 + + // Windows1258 is the MIB identifier with IANA name windows-1258. + // + // Microsoft https://www.iana.org/assignments/charset-reg/windows-1258 + Windows1258 MIB = 2258 + + // TIS620 is the MIB identifier with IANA name TIS-620. + // + // Thai Industrial Standards Institute (TISI) + TIS620 MIB = 2259 + + // CP50220 is the MIB identifier with IANA name CP50220. + // + // See https://www.iana.org/assignments/charset-reg/CP50220 + CP50220 MIB = 2260 +) diff --git a/vendor/golang.org/x/text/encoding/internal/internal.go b/vendor/golang.org/x/text/encoding/internal/internal.go new file mode 100644 index 0000000000..413e6fc6d7 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/internal/internal.go @@ -0,0 +1,75 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package internal contains code that is shared among encoding implementations. +package internal + +import ( + "golang.org/x/text/encoding" + "golang.org/x/text/encoding/internal/identifier" + "golang.org/x/text/transform" +) + +// Encoding is an implementation of the Encoding interface that adds the String +// and ID methods to an existing encoding. 
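An aside on how the MIB table above is consumed: outside this internal package, the identifiers surface through golang.org/x/text/encoding/ianaindex, which maps IANA charset names (the same registry entries the constants mirror) to encodings and back. A minimal sketch using only that public API; the example program is illustrative, not part of the vendored patch:

package main

import (
	"fmt"

	"golang.org/x/text/encoding/ianaindex"
)

func main() {
	// Resolve an IANA charset name to an encoding, then map the
	// encoding back to its canonical name in the same index.
	e, err := ianaindex.IANA.Encoding("UTF-8")
	if err != nil || e == nil {
		panic(err)
	}
	name, err := ianaindex.IANA.Name(e)
	if err != nil {
		panic(err)
	}
	fmt.Println(name) // UTF-8, MIB identifier 106 in the table above
}

The same Index type also backs the MIME and MIB name tables in that package.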
+type Encoding struct { + encoding.Encoding + Name string + MIB identifier.MIB +} + +// _ verifies that Encoding implements identifier.Interface. +var _ identifier.Interface = (*Encoding)(nil) + +func (e *Encoding) String() string { + return e.Name +} + +func (e *Encoding) ID() (mib identifier.MIB, other string) { + return e.MIB, "" +} + +// SimpleEncoding is an Encoding that combines two Transformers. +type SimpleEncoding struct { + Decoder transform.Transformer + Encoder transform.Transformer +} + +func (e *SimpleEncoding) NewDecoder() *encoding.Decoder { + return &encoding.Decoder{Transformer: e.Decoder} +} + +func (e *SimpleEncoding) NewEncoder() *encoding.Encoder { + return &encoding.Encoder{Transformer: e.Encoder} +} + +// FuncEncoding is an Encoding that combines two functions returning a new +// Transformer. +type FuncEncoding struct { + Decoder func() transform.Transformer + Encoder func() transform.Transformer +} + +func (e FuncEncoding) NewDecoder() *encoding.Decoder { + return &encoding.Decoder{Transformer: e.Decoder()} +} + +func (e FuncEncoding) NewEncoder() *encoding.Encoder { + return &encoding.Encoder{Transformer: e.Encoder()} +} + +// A RepertoireError indicates a rune is not in the repertoire of a destination +// encoding. It is associated with an encoding-specific suggested replacement +// byte. +type RepertoireError byte + +// Error implements the error interface. +func (r RepertoireError) Error() string { + return "encoding: rune not supported by encoding." +} + +// Replacement returns the replacement string associated with this error. +func (r RepertoireError) Replacement() byte { return byte(r) } + +var ErrASCIIReplacement = RepertoireError(encoding.ASCIISub) diff --git a/vendor/golang.org/x/text/encoding/unicode/override.go b/vendor/golang.org/x/text/encoding/unicode/override.go new file mode 100644 index 0000000000..35d62fcc99 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/unicode/override.go @@ -0,0 +1,82 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package unicode + +import ( + "golang.org/x/text/transform" +) + +// BOMOverride returns a new decoder transformer that is identical to fallback, +// except that the presence of a Byte Order Mark at the start of the input +// causes it to switch to the corresponding Unicode decoding. It will only +// consider BOMs for UTF-8, UTF-16BE, and UTF-16LE. +// +// This differs from using ExpectBOM by allowing a BOM to switch to UTF-8, not +// just UTF-16 variants, and allowing falling back to any encoding scheme. +// +// This technique is recommended by the W3C for use in HTML 5: "For +// compatibility with deployed content, the byte order mark (also known as BOM) +// is considered more authoritative than anything else." +// http://www.w3.org/TR/encoding/#specification-hooks +// +// Using BOMOverride is mostly intended for use cases where the first characters +// of a fallback encoding are known to not be a BOM, for example, for valid HTML +// and most encodings. +func BOMOverride(fallback transform.Transformer) transform.Transformer { + // TODO: possibly allow a variadic argument of unicode encodings to allow + // specifying details of which fallbacks are supported as well as + // specifying the details of the implementations. This would also allow for + // support for UTF-32, which should not be supported by default. 
+ return &bomOverride{fallback: fallback} +} + +type bomOverride struct { + fallback transform.Transformer + current transform.Transformer +} + +func (d *bomOverride) Reset() { + d.current = nil + d.fallback.Reset() +} + +var ( + // TODO: we could use decode functions here, instead of allocating a new + // decoder on every NewDecoder as IgnoreBOM decoders can be stateless. + utf16le = UTF16(LittleEndian, IgnoreBOM) + utf16be = UTF16(BigEndian, IgnoreBOM) +) + +const utf8BOM = "\ufeff" + +func (d *bomOverride) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + if d.current != nil { + return d.current.Transform(dst, src, atEOF) + } + if len(src) < 3 && !atEOF { + return 0, 0, transform.ErrShortSrc + } + d.current = d.fallback + bomSize := 0 + if len(src) >= 2 { + if src[0] == 0xFF && src[1] == 0xFE { + d.current = utf16le.NewDecoder() + bomSize = 2 + } else if src[0] == 0xFE && src[1] == 0xFF { + d.current = utf16be.NewDecoder() + bomSize = 2 + } else if len(src) >= 3 && + src[0] == utf8BOM[0] && + src[1] == utf8BOM[1] && + src[2] == utf8BOM[2] { + d.current = transform.Nop + bomSize = 3 + } + } + if bomSize < len(src) { + nDst, nSrc, err = d.current.Transform(dst, src[bomSize:], atEOF) + } + return nDst, nSrc + bomSize, err +} diff --git a/vendor/golang.org/x/text/encoding/unicode/unicode.go b/vendor/golang.org/x/text/encoding/unicode/unicode.go new file mode 100644 index 0000000000..dd99ad14d3 --- /dev/null +++ b/vendor/golang.org/x/text/encoding/unicode/unicode.go @@ -0,0 +1,512 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package unicode provides Unicode encodings such as UTF-16. +package unicode // import "golang.org/x/text/encoding/unicode" + +import ( + "bytes" + "errors" + "unicode/utf16" + "unicode/utf8" + + "golang.org/x/text/encoding" + "golang.org/x/text/encoding/internal" + "golang.org/x/text/encoding/internal/identifier" + "golang.org/x/text/internal/utf8internal" + "golang.org/x/text/runes" + "golang.org/x/text/transform" +) + +// TODO: I think the Transformers really should return errors on unmatched +// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781, +// which leaves it open, but is suggested by WhatWG. It will allow for all error +// modes as defined by WhatWG: fatal, HTML and Replacement. This would require +// the introduction of some kind of error type for conveying the erroneous code +// point. + +// UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks. +var UTF8 encoding.Encoding = utf8enc + +// UTF8BOM is an UTF-8 encoding where the decoder strips a leading byte order +// mark while the encoder adds one. +// +// Some editors add a byte order mark as a signature to UTF-8 files. Although +// the byte order mark is not useful for detecting byte order in UTF-8, it is +// sometimes used as a convention to mark UTF-8-encoded files. This relies on +// the observation that the UTF-8 byte order mark is either an illegal or at +// least very unlikely sequence in any other character encoding. 
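The BOMOverride transformer above is normally wrapped in a transform.Reader so that a leading BOM, when present, overrides a legacy fallback decoder. A hedged sketch of that wiring; Windows-1252 is an arbitrary fallback chosen for illustration:

package main

import (
	"io"
	"os"
	"strings"

	"golang.org/x/text/encoding/charmap"
	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

func main() {
	// Decode as Windows-1252 unless the stream opens with a UTF-8 or
	// UTF-16 BOM, in which case the BOM wins (the W3C HTML rule).
	fallback := charmap.Windows1252.NewDecoder()
	r := transform.NewReader(strings.NewReader("\ufeffhello"), unicode.BOMOverride(fallback))
	io.Copy(os.Stdout, r) // prints "hello"; the UTF-8 BOM is consumed
}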
+var UTF8BOM encoding.Encoding = utf8bomEncoding{} + +type utf8bomEncoding struct{} + +func (utf8bomEncoding) String() string { + return "UTF-8-BOM" +} + +func (utf8bomEncoding) ID() (identifier.MIB, string) { + return identifier.Unofficial, "x-utf8bom" +} + +func (utf8bomEncoding) NewEncoder() *encoding.Encoder { + return &encoding.Encoder{ + Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()}, + } +} + +func (utf8bomEncoding) NewDecoder() *encoding.Decoder { + return &encoding.Decoder{Transformer: &utf8bomDecoder{}} +} + +var utf8enc = &internal.Encoding{ + &internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()}, + "UTF-8", + identifier.UTF8, +} + +type utf8bomDecoder struct { + checked bool +} + +func (t *utf8bomDecoder) Reset() { + t.checked = false +} + +func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + if !t.checked { + if !atEOF && len(src) < len(utf8BOM) { + if len(src) == 0 { + return 0, 0, nil + } + return 0, 0, transform.ErrShortSrc + } + if bytes.HasPrefix(src, []byte(utf8BOM)) { + nSrc += len(utf8BOM) + src = src[len(utf8BOM):] + } + t.checked = true + } + nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF) + nSrc += n + return nDst, nSrc, err +} + +type utf8bomEncoder struct { + written bool + t transform.Transformer +} + +func (t *utf8bomEncoder) Reset() { + t.written = false + t.t.Reset() +} + +func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + if !t.written { + if len(dst) < len(utf8BOM) { + return nDst, 0, transform.ErrShortDst + } + nDst = copy(dst, utf8BOM) + t.written = true + } + n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF) + nDst += n + return nDst, nSrc, err +} + +type utf8Decoder struct{ transform.NopResetter } + +func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + var pSrc int // point from which to start copy in src + var accept utf8internal.AcceptRange + + // The decoder can only make the input larger, not smaller. + n := len(src) + if len(dst) < n { + err = transform.ErrShortDst + n = len(dst) + atEOF = false + } + for nSrc < n { + c := src[nSrc] + if c < utf8.RuneSelf { + nSrc++ + continue + } + first := utf8internal.First[c] + size := int(first & utf8internal.SizeMask) + if first == utf8internal.FirstInvalid { + goto handleInvalid // invalid starter byte + } + accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift] + if nSrc+size > n { + if !atEOF { + // We may stop earlier than necessary here if the short sequence + // has invalid bytes. Not checking for this simplifies the code + // and may avoid duplicate computations in certain conditions. + if err == nil { + err = transform.ErrShortSrc + } + break + } + // Determine the maximal subpart of an ill-formed subsequence. + switch { + case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]: + size = 1 + case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]: + size = 2 + default: + size = 3 // As we are short, the maximum is 3. 
+			}
+			goto handleInvalid
+		}
+		if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
+			size = 1
+			goto handleInvalid // invalid continuation byte
+		} else if size == 2 {
+		} else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
+			size = 2
+			goto handleInvalid // invalid continuation byte
+		} else if size == 3 {
+		} else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
+			size = 3
+			goto handleInvalid // invalid continuation byte
+		}
+		nSrc += size
+		continue
+
+	handleInvalid:
+		// Copy the scanned input so far.
+		nDst += copy(dst[nDst:], src[pSrc:nSrc])
+
+		// Append RuneError to the destination.
+		const runeError = "\ufffd"
+		if nDst+len(runeError) > len(dst) {
+			return nDst, nSrc, transform.ErrShortDst
+		}
+		nDst += copy(dst[nDst:], runeError)
+
+		// Skip the maximal subpart of an ill-formed subsequence according to
+		// the W3C standard way instead of the Go way. This Transform is
+		// probably the only place in the text repo where it is warranted.
+		nSrc += size
+		pSrc = nSrc
+
+		// Recompute the maximum source length.
+		if sz := len(dst) - nDst; sz < len(src)-nSrc {
+			err = transform.ErrShortDst
+			n = nSrc + sz
+			atEOF = false
+		}
+	}
+	return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
+}
+
+// UTF16 returns a UTF-16 Encoding for the given default endianness and byte
+// order mark (BOM) policy.
+//
+// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
+// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
+// the endianness used for decoding, and will instead be output as their
+// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
+// is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8 output.
+// Instead, it overrides the default endianness e for the remainder of the
+// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
+// affect the endianness used, and will instead be output as their standard
+// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
+// with the default Endianness. For ExpectBOM, in that case, the transformation
+// will return early with an ErrMissingBOM error.
+//
+// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
+// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
+// be inserted. The UTF-8 input does not need to contain a BOM.
+//
+// There is no concept of a 'native' endianness. If the UTF-16 data is produced
+// and consumed in a greater context that implies a certain endianness, use
+// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
+//
+// In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
+// corresponds to "Where the precise type of the data stream is known... the
+// BOM should not be used" and ExpectBOM corresponds to "A particular
+// protocol... may require use of the BOM".
+func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
+	return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}
+}
+
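The handleInvalid branch above is observable through the public UTF8 encoding: each maximal ill-formed subsequence becomes exactly one U+FFFD, per the W3C rule the comment cites. A small sketch; the input bytes are an arbitrary truncated three-byte sequence:

package main

import (
	"fmt"

	"golang.org/x/text/encoding/unicode"
)

func main() {
	// "\xe2\x82" starts a three-byte rune but is cut short by 'z';
	// the whole two-byte subpart collapses into a single U+FFFD.
	out, err := unicode.UTF8.NewDecoder().Bytes([]byte("a\xe2\x82z"))
	if err != nil {
		panic(err)
	}
	fmt.Printf("%q\n", out) // "a\ufffdz"
}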
+// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
+// some configurations map to the same MIB identifier. RFC 2781 has requirements
+// and recommendations. Some of the "configurations" are merely recommendations,
+// so multiple configurations could match.
+var mibValue = map[Endianness][numBOMValues]identifier.MIB{
+	BigEndian: [numBOMValues]identifier.MIB{
+		IgnoreBOM: identifier.UTF16BE,
+		UseBOM:    identifier.UTF16, // BigEndian default is preferred by RFC 2781.
+		// TODO: acceptBOM | strictBOM would map to UTF16BE as well.
+	},
+	LittleEndian: [numBOMValues]identifier.MIB{
+		IgnoreBOM: identifier.UTF16LE,
+		UseBOM:    identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
+		// TODO: acceptBOM | strictBOM would map to UTF16LE as well.
+	},
+	// ExpectBOM is not widely used and has no valid MIB identifier.
+}
+
+// All lists a configuration for each IANA-defined Unicode character set.
+var All = []encoding.Encoding{
+	UTF8,
+	UTF16(BigEndian, UseBOM),
+	UTF16(BigEndian, IgnoreBOM),
+	UTF16(LittleEndian, IgnoreBOM),
+}
+
+// BOMPolicy is a UTF-16 encoding's byte order mark policy.
+type BOMPolicy uint8
+
+const (
+	writeBOM   BOMPolicy = 0x01
+	acceptBOM  BOMPolicy = 0x02
+	requireBOM BOMPolicy = 0x04
+	bomMask    BOMPolicy = 0x07
+
+	// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
+	// map of an array of length 8 of a type that is also used as a key or value
+	// in another map). See golang.org/issue/11354.
+	// TODO: consider changing this value back to 8 if the use of 1.4.* has
+	// been minimized.
+	numBOMValues = 8 + 1
+
+	// IgnoreBOM means to ignore any byte order marks.
+	IgnoreBOM BOMPolicy = 0
+	// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.
+
+	// UseBOM means that the UTF-16 form may start with a byte order mark, which
+	// will be used to override the default encoding.
+	UseBOM BOMPolicy = writeBOM | acceptBOM
+	// Common and RFC 2781-compliant interpretation for UTF-16.
+
+	// ExpectBOM means that the UTF-16 form must start with a byte order mark,
+	// which will be used to override the default encoding.
+	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
+	// Used in Java as Unicode (not to be confused with Java's UTF-16) and
+	// ICU's UTF-16,version=1. Not compliant with RFC 2781.
+
+	// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
+	// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
+	//   (UnicodeBig and UnicodeLittle in Java)
+	// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
+	//   acceptBOM | strictBOM (e.g. assigned to CheckBOM).
+	// This addition would be consistent with supporting ExpectBOM.
+)
+
+// Endianness is a UTF-16 encoding's default endianness.
+type Endianness bool
+
+const (
+	// BigEndian is UTF-16BE.
+	BigEndian Endianness = false
+	// LittleEndian is UTF-16LE.
+	LittleEndian Endianness = true
+)
+
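A hedged round-trip sketch of how the Endianness and BOMPolicy values above combine in practice, using only the public API; the sample string is arbitrary:

package main

import (
	"fmt"

	"golang.org/x/text/encoding/unicode"
)

func main() {
	// LittleEndian + UseBOM: the encoder writes an ff fe BOM, and the
	// decoder uses that BOM to pick the byte order on the way back.
	enc := unicode.UTF16(unicode.LittleEndian, unicode.UseBOM)
	b, err := enc.NewEncoder().Bytes([]byte("hi"))
	if err != nil {
		panic(err)
	}
	fmt.Printf("% x\n", b) // ff fe 68 00 69 00
	round, err := enc.NewDecoder().Bytes(b)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(round)) // hi
}

+// ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a
+// starting byte order mark.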
+var ErrMissingBOM = errors.New("encoding: missing byte order mark") + +type utf16Encoding struct { + config + mib identifier.MIB +} + +type config struct { + endianness Endianness + bomPolicy BOMPolicy +} + +func (u utf16Encoding) NewDecoder() *encoding.Decoder { + return &encoding.Decoder{Transformer: &utf16Decoder{ + initial: u.config, + current: u.config, + }} +} + +func (u utf16Encoding) NewEncoder() *encoding.Encoder { + return &encoding.Encoder{Transformer: &utf16Encoder{ + endianness: u.endianness, + initialBOMPolicy: u.bomPolicy, + currentBOMPolicy: u.bomPolicy, + }} +} + +func (u utf16Encoding) ID() (mib identifier.MIB, other string) { + return u.mib, "" +} + +func (u utf16Encoding) String() string { + e, b := "B", "" + if u.endianness == LittleEndian { + e = "L" + } + switch u.bomPolicy { + case ExpectBOM: + b = "Expect" + case UseBOM: + b = "Use" + case IgnoreBOM: + b = "Ignore" + } + return "UTF-16" + e + "E (" + b + " BOM)" +} + +type utf16Decoder struct { + initial config + current config +} + +func (u *utf16Decoder) Reset() { + u.current = u.initial +} + +func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 { + return 0, 0, ErrMissingBOM + } + if len(src) == 0 { + return 0, 0, nil + } + if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 { + switch { + case src[0] == 0xfe && src[1] == 0xff: + u.current.endianness = BigEndian + nSrc = 2 + case src[0] == 0xff && src[1] == 0xfe: + u.current.endianness = LittleEndian + nSrc = 2 + default: + if u.current.bomPolicy&requireBOM != 0 { + return 0, 0, ErrMissingBOM + } + } + u.current.bomPolicy = IgnoreBOM + } + + var r rune + var dSize, sSize int + for nSrc < len(src) { + if nSrc+1 < len(src) { + x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1]) + if u.current.endianness == LittleEndian { + x = x>>8 | x<<8 + } + r, sSize = rune(x), 2 + if utf16.IsSurrogate(r) { + if nSrc+3 < len(src) { + x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3]) + if u.current.endianness == LittleEndian { + x = x>>8 | x<<8 + } + // Save for next iteration if it is not a high surrogate. + if isHighSurrogate(rune(x)) { + r, sSize = utf16.DecodeRune(r, rune(x)), 4 + } + } else if !atEOF { + err = transform.ErrShortSrc + break + } + } + if dSize = utf8.RuneLen(r); dSize < 0 { + r, dSize = utf8.RuneError, 3 + } + } else if atEOF { + // Single trailing byte. + r, dSize, sSize = utf8.RuneError, 3, 1 + } else { + err = transform.ErrShortSrc + break + } + if nDst+dSize > len(dst) { + err = transform.ErrShortDst + break + } + nDst += utf8.EncodeRune(dst[nDst:], r) + nSrc += sSize + } + return nDst, nSrc, err +} + +func isHighSurrogate(r rune) bool { + return 0xDC00 <= r && r <= 0xDFFF +} + +type utf16Encoder struct { + endianness Endianness + initialBOMPolicy BOMPolicy + currentBOMPolicy BOMPolicy +} + +func (u *utf16Encoder) Reset() { + u.currentBOMPolicy = u.initialBOMPolicy +} + +func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + if u.currentBOMPolicy&writeBOM != 0 { + if len(dst) < 2 { + return 0, 0, transform.ErrShortDst + } + dst[0], dst[1] = 0xfe, 0xff + u.currentBOMPolicy = IgnoreBOM + nDst = 2 + } + + r, size := rune(0), 0 + for nSrc < len(src) { + r = rune(src[nSrc]) + + // Decode a 1-byte rune. + if r < utf8.RuneSelf { + size = 1 + + } else { + // Decode a multi-byte rune. 
+			r, size = utf8.DecodeRune(src[nSrc:])
+			if size == 1 {
+				// All valid runes of size 1 (those below utf8.RuneSelf) were
+				// handled above. We have invalid UTF-8 or we haven't seen the
+				// full character yet.
+				if !atEOF && !utf8.FullRune(src[nSrc:]) {
+					err = transform.ErrShortSrc
+					break
+				}
+			}
+		}
+
+		if r <= 0xffff {
+			if nDst+2 > len(dst) {
+				err = transform.ErrShortDst
+				break
+			}
+			dst[nDst+0] = uint8(r >> 8)
+			dst[nDst+1] = uint8(r)
+			nDst += 2
+		} else {
+			if nDst+4 > len(dst) {
+				err = transform.ErrShortDst
+				break
+			}
+			r1, r2 := utf16.EncodeRune(r)
+			dst[nDst+0] = uint8(r1 >> 8)
+			dst[nDst+1] = uint8(r1)
+			dst[nDst+2] = uint8(r2 >> 8)
+			dst[nDst+3] = uint8(r2)
+			nDst += 4
+		}
+		nSrc += size
+	}
+
+	if u.endianness == LittleEndian {
+		for i := 0; i < nDst; i += 2 {
+			dst[i], dst[i+1] = dst[i+1], dst[i]
+		}
+	}
+	return nDst, nSrc, err
+}
diff --git a/vendor/golang.org/x/text/internal/utf8internal/utf8internal.go b/vendor/golang.org/x/text/internal/utf8internal/utf8internal.go
new file mode 100644
index 0000000000..e5c53b1b3e
--- /dev/null
+++ b/vendor/golang.org/x/text/internal/utf8internal/utf8internal.go
@@ -0,0 +1,87 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package utf8internal contains low-level utf8-related constants, tables, etc.
+// that are used internally by the text package.
+package utf8internal
+
+// The default lowest and highest continuation byte.
+const (
+	LoCB = 0x80 // 1000 0000
+	HiCB = 0xBF // 1011 1111
+)
+
+// Constants related to getting information of first bytes of UTF-8 sequences.
+const (
+	// ASCII identifies a UTF-8 byte as ASCII.
+	ASCII = as
+
+	// FirstInvalid indicates a byte is invalid as a first byte of a UTF-8
+	// sequence.
+	FirstInvalid = xx
+
+	// SizeMask is a mask for the size bits. Use x&SizeMask to get the size.
+	SizeMask = 7
+
+	// AcceptShift is the right-shift count for the first byte info byte to get
+	// the index into the AcceptRanges table. See AcceptRanges.
+	AcceptShift = 4
+
+	// The names of these constants are chosen to give nice alignment in the
+	// table below. The first nibble is an index into acceptRanges or F for
+	// special one-byte cases. The second nibble is the Rune length or the
+	// Status for the special one-byte case.
+	xx = 0xF1 // invalid: size 1
+	as = 0xF0 // ASCII: size 1
+	s1 = 0x02 // accept 0, size 2
+	s2 = 0x13 // accept 1, size 3
+	s3 = 0x03 // accept 0, size 3
+	s4 = 0x23 // accept 2, size 3
+	s5 = 0x34 // accept 3, size 4
+	s6 = 0x04 // accept 0, size 4
+	s7 = 0x44 // accept 4, size 4
+)
+
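Before the First table itself, a sketch of how one packed entry decodes; the constants are copied from this file, since utf8internal is internal to x/text and cannot be imported from outside it:

package main

import "fmt"

func main() {
	// First[0xE0] is s2 = 0x13: the low nibble is the sequence size,
	// the high nibble indexes AcceptRanges, i.e. the second byte of an
	// E0-led rune must lie in AcceptRanges[1] = [0xA0, 0xBF].
	const (
		sizeMask    = 7
		acceptShift = 4
		s2          = 0x13
	)
	size := s2 & sizeMask
	acceptIndex := s2 >> acceptShift
	fmt.Println(size, acceptIndex) // 3 1
}

+// First is information about the first byte in a UTF-8 sequence.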
+var First = [256]uint8{ + // 1 2 3 4 5 6 7 8 9 A B C D E F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F + as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F + // 1 2 3 4 5 6 7 8 9 A B C D E F + xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F + xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F + xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF + xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF + xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF + s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF + s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF + s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF +} + +// AcceptRange gives the range of valid values for the second byte in a UTF-8 +// sequence for any value for First that is not ASCII or FirstInvalid. +type AcceptRange struct { + Lo uint8 // lowest value for second byte. + Hi uint8 // highest value for second byte. +} + +// AcceptRanges is a slice of AcceptRange values. For a given byte sequence b +// +// AcceptRanges[First[b[0]]>>AcceptShift] +// +// will give the value of AcceptRange for the multi-byte UTF-8 sequence starting +// at b[0]. +var AcceptRanges = [...]AcceptRange{ + 0: {LoCB, HiCB}, + 1: {0xA0, HiCB}, + 2: {LoCB, 0x9F}, + 3: {0x90, HiCB}, + 4: {LoCB, 0x8F}, +} diff --git a/vendor/golang.org/x/text/runes/cond.go b/vendor/golang.org/x/text/runes/cond.go new file mode 100644 index 0000000000..df7aa02db6 --- /dev/null +++ b/vendor/golang.org/x/text/runes/cond.go @@ -0,0 +1,187 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runes + +import ( + "unicode/utf8" + + "golang.org/x/text/transform" +) + +// Note: below we pass invalid UTF-8 to the tIn and tNotIn transformers as is. +// This is done for various reasons: +// - To retain the semantics of the Nop transformer: if input is passed to a Nop +// one would expect it to be unchanged. +// - It would be very expensive to pass a converted RuneError to a transformer: +// a transformer might need more source bytes after RuneError, meaning that +// the only way to pass it safely is to create a new buffer and manage the +// intermingling of RuneErrors and normal input. +// - Many transformers leave ill-formed UTF-8 as is, so this is not +// inconsistent. Generally ill-formed UTF-8 is only replaced if it is a +// logical consequence of the operation (as for Map) or if it otherwise would +// pose security concerns (as for Remove). +// - An alternative would be to return an error on ill-formed UTF-8, but this +// would be inconsistent with other operations. + +// If returns a transformer that applies tIn to consecutive runes for which +// s.Contains(r) and tNotIn to consecutive runes for which !s.Contains(r). 
+// Reset is called on tIn and tNotIn at the start of each run. A Nop transformer
+// will substitute a nil value passed to tIn or tNotIn. Invalid UTF-8 is
+// translated to RuneError to determine which transformer to apply, but is
+// passed as is to the respective transformer.
+func If(s Set, tIn, tNotIn transform.Transformer) Transformer {
+	if tIn == nil && tNotIn == nil {
+		return Transformer{transform.Nop}
+	}
+	if tIn == nil {
+		tIn = transform.Nop
+	}
+	if tNotIn == nil {
+		tNotIn = transform.Nop
+	}
+	sIn, ok := tIn.(transform.SpanningTransformer)
+	if !ok {
+		sIn = dummySpan{tIn}
+	}
+	sNotIn, ok := tNotIn.(transform.SpanningTransformer)
+	if !ok {
+		sNotIn = dummySpan{tNotIn}
+	}
+
+	a := &cond{
+		tIn:    sIn,
+		tNotIn: sNotIn,
+		f:      s.Contains,
+	}
+	a.Reset()
+	return Transformer{a}
+}
+
+type dummySpan struct{ transform.Transformer }
+
+func (d dummySpan) Span(src []byte, atEOF bool) (n int, err error) {
+	return 0, transform.ErrEndOfSpan
+}
+
+type cond struct {
+	tIn, tNotIn transform.SpanningTransformer
+	f           func(rune) bool
+	check       func(rune) bool               // current check to perform
+	t           transform.SpanningTransformer // current transformer to use
+}
+
+// Reset implements transform.Transformer.
+func (t *cond) Reset() {
+	t.check = t.is
+	t.t = t.tIn
+	t.t.Reset() // notIn will be reset on first usage.
+}
+
+func (t *cond) is(r rune) bool {
+	if t.f(r) {
+		return true
+	}
+	t.check = t.isNot
+	t.t = t.tNotIn
+	t.tNotIn.Reset()
+	return false
+}
+
+func (t *cond) isNot(r rune) bool {
+	if !t.f(r) {
+		return true
+	}
+	t.check = t.is
+	t.t = t.tIn
+	t.tIn.Reset()
+	return false
+}
+
+// This implementation of Span doesn't help all too much, but it needs to be
+// there to satisfy this package's Transformer interface.
+// TODO: there is certainly room for improvements, though. For example, if
+// t.t == transform.Nop (which will be a common occurrence) it will save a
+// bundle to special-case that loop.
+func (t *cond) Span(src []byte, atEOF bool) (n int, err error) {
+	p := 0
+	for n < len(src) && err == nil {
+		// Don't process too much at a time as the Spanner that will be
+		// called on this block may terminate early.
+		const maxChunk = 4096
+		max := len(src)
+		if v := n + maxChunk; v < max {
+			max = v
+		}
+		atEnd := false
+		size := 0
+		current := t.t
+		for ; p < max; p += size {
+			r := rune(src[p])
+			if r < utf8.RuneSelf {
+				size = 1
+			} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
+				if !atEOF && !utf8.FullRune(src[p:]) {
+					err = transform.ErrShortSrc
+					break
+				}
+			}
+			if !t.check(r) {
+				// The next rune will be the start of a new run.
+				atEnd = true
+				break
+			}
+		}
+		n2, err2 := current.Span(src[n:p], atEnd || (atEOF && p == len(src)))
+		n += n2
+		if err2 != nil {
+			return n, err2
+		}
+		// At this point either err != nil or t.check will pass for the rune at p.
+		p = n + size
+	}
+	return n, err
+}
+
+func (t *cond) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+	p := 0
+	for nSrc < len(src) && err == nil {
+		// Don't process too much at a time, as the work might be wasted if the
+		// destination buffer isn't large enough to hold the result or a
+		// transform returns an error early.
+		const maxChunk = 4096
+		max := len(src)
+		if n := nSrc + maxChunk; n < len(src) {
+			max = n
+		}
+		atEnd := false
+		size := 0
+		current := t.t
+		for ; p < max; p += size {
+			r := rune(src[p])
+			if r < utf8.RuneSelf {
+				size = 1
+			} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
+				if !atEOF && !utf8.FullRune(src[p:]) {
+					err = transform.ErrShortSrc
+					break
+				}
+			}
+			if !t.check(r) {
+				// The next rune will be the start of a new run.
+				atEnd = true
+				break
+			}
+		}
+		nDst2, nSrc2, err2 := current.Transform(dst[nDst:], src[nSrc:p], atEnd || (atEOF && p == len(src)))
+		nDst += nDst2
+		nSrc += nSrc2
+		if err2 != nil {
+			return nDst, nSrc, err2
+		}
+		// At this point either err != nil or t.check will pass for the rune at p.
+		p = nSrc + size
+	}
+	return nDst, nSrc, err
+}
diff --git a/vendor/golang.org/x/text/runes/runes.go b/vendor/golang.org/x/text/runes/runes.go
new file mode 100644
index 0000000000..930e87fedb
--- /dev/null
+++ b/vendor/golang.org/x/text/runes/runes.go
@@ -0,0 +1,355 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package runes provides transforms for UTF-8 encoded text.
+package runes // import "golang.org/x/text/runes"
+
+import (
+	"unicode"
+	"unicode/utf8"
+
+	"golang.org/x/text/transform"
+)
+
+// A Set is a collection of runes.
+type Set interface {
+	// Contains returns true if r is contained in the set.
+	Contains(r rune) bool
+}
+
+type setFunc func(rune) bool
+
+func (s setFunc) Contains(r rune) bool {
+	return s(r)
+}
+
+// Note: using funcs here instead of wrapping types results in cleaner
+// documentation and a smaller API.
+
+// In creates a Set with a Contains method that returns true for all runes in
+// the given RangeTable.
+func In(rt *unicode.RangeTable) Set {
+	return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
+}
+
+// NotIn creates a Set with a Contains method that returns true for all runes not
+// in the given RangeTable.
+func NotIn(rt *unicode.RangeTable) Set {
+	return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
+}
+
+// Predicate creates a Set with a Contains method that returns f(r).
+func Predicate(f func(rune) bool) Set {
+	return setFunc(f)
+}
+
+// Transformer implements the transform.Transformer interface.
+type Transformer struct {
+	t transform.SpanningTransformer
+}
+
+func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+	return t.t.Transform(dst, src, atEOF)
+}
+
+func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
+	return t.t.Span(b, atEOF)
+}
+
+func (t Transformer) Reset() { t.t.Reset() }
+
+// Bytes returns a new byte slice with the result of converting b using t. It
+// calls Reset on t. It returns nil if any error was found. This can only happen
+// if an error-producing Transformer is passed to If.
+func (t Transformer) Bytes(b []byte) []byte {
+	b, _, err := transform.Bytes(t, b)
+	if err != nil {
+		return nil
+	}
+	return b
+}
+
+// String returns a string with the result of converting s using t. It calls
+// Reset on t. It returns the empty string if any error was found. This can only
+// happen if an error-producing Transformer is passed to If.
+func (t Transformer) String(s string) string {
+	s, _, err := transform.String(t, s)
+	if err != nil {
+		return ""
+	}
+	return s
+}
+
+// TODO:
+// - Copy: copying strings and bytes in whole-rune units.
+// - Validation (maybe)
+// - Well-formed-ness (maybe)
+
+const runeErrorString = string(utf8.RuneError)
+
+// Remove returns a Transformer that removes runes r for which s.Contains(r).
+// Illegal input bytes are replaced by RuneError before being passed to
+// s.Contains.
+func Remove(s Set) Transformer {
+	if f, ok := s.(setFunc); ok {
+		// This little trick cuts the running time of BenchmarkRemove for sets
+		// created by Predicate roughly in half.
+		// TODO: special-case RangeTables as well.
+		return Transformer{remove(f)}
+	}
+	return Transformer{remove(s.Contains)}
+}
+
+// TODO: remove transform.RemoveFunc.
+
+type remove func(r rune) bool
+
+func (remove) Reset() {}
+
+// Span implements transform.Spanner.
+func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
+	for r, size := rune(0), 0; n < len(src); {
+		if r = rune(src[n]); r < utf8.RuneSelf {
+			size = 1
+		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
+			// Invalid rune.
+			if !atEOF && !utf8.FullRune(src[n:]) {
+				err = transform.ErrShortSrc
+			} else {
+				err = transform.ErrEndOfSpan
+			}
+			break
+		}
+		if t(r) {
+			err = transform.ErrEndOfSpan
+			break
+		}
+		n += size
+	}
+	return
+}
+
+// Transform implements transform.Transformer.
+func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+	for r, size := rune(0), 0; nSrc < len(src); {
+		if r = rune(src[nSrc]); r < utf8.RuneSelf {
+			size = 1
+		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
+			// Invalid rune.
+			if !atEOF && !utf8.FullRune(src[nSrc:]) {
+				err = transform.ErrShortSrc
+				break
+			}
+			// We replace illegal bytes with RuneError. Not doing so might
+			// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
+			// The resulting byte sequence may subsequently contain runes
+			// for which t(r) is true that were passed unnoticed.
+			if !t(utf8.RuneError) {
+				if nDst+3 > len(dst) {
+					err = transform.ErrShortDst
+					break
+				}
+				dst[nDst+0] = runeErrorString[0]
+				dst[nDst+1] = runeErrorString[1]
+				dst[nDst+2] = runeErrorString[2]
+				nDst += 3
+			}
+			nSrc++
+			continue
+		}
+		if t(r) {
+			nSrc += size
+			continue
+		}
+		if nDst+size > len(dst) {
+			err = transform.ErrShortDst
+			break
+		}
+		for i := 0; i < size; i++ {
+			dst[nDst] = src[nSrc]
+			nDst++
+			nSrc++
+		}
+	}
+	return
+}
+
+// Map returns a Transformer that maps the runes in the input using the given
+// mapping. Illegal bytes in the input are converted to utf8.RuneError before
+// being passed to the mapping func.
+func Map(mapping func(rune) rune) Transformer {
+	return Transformer{mapper(mapping)}
+}
+
+type mapper func(rune) rune
+
+func (mapper) Reset() {}
+
+// Span implements transform.Spanner.
+func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
+	for r, size := rune(0), 0; n < len(src); n += size {
+		if r = rune(src[n]); r < utf8.RuneSelf {
+			size = 1
+		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
+			// Invalid rune.
+			if !atEOF && !utf8.FullRune(src[n:]) {
+				err = transform.ErrShortSrc
+			} else {
+				err = transform.ErrEndOfSpan
+			}
+			break
+		}
+		if t(r) != r {
+			err = transform.ErrEndOfSpan
+			break
+		}
+	}
+	return n, err
+}
+
+// Transform implements transform.Transformer.
+func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + var replacement rune + var b [utf8.UTFMax]byte + + for r, size := rune(0), 0; nSrc < len(src); { + if r = rune(src[nSrc]); r < utf8.RuneSelf { + if replacement = t(r); replacement < utf8.RuneSelf { + if nDst == len(dst) { + err = transform.ErrShortDst + break + } + dst[nDst] = byte(replacement) + nDst++ + nSrc++ + continue + } + size = 1 + } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { + // Invalid rune. + if !atEOF && !utf8.FullRune(src[nSrc:]) { + err = transform.ErrShortSrc + break + } + + if replacement = t(utf8.RuneError); replacement == utf8.RuneError { + if nDst+3 > len(dst) { + err = transform.ErrShortDst + break + } + dst[nDst+0] = runeErrorString[0] + dst[nDst+1] = runeErrorString[1] + dst[nDst+2] = runeErrorString[2] + nDst += 3 + nSrc++ + continue + } + } else if replacement = t(r); replacement == r { + if nDst+size > len(dst) { + err = transform.ErrShortDst + break + } + for i := 0; i < size; i++ { + dst[nDst] = src[nSrc] + nDst++ + nSrc++ + } + continue + } + + n := utf8.EncodeRune(b[:], replacement) + + if nDst+n > len(dst) { + err = transform.ErrShortDst + break + } + for i := 0; i < n; i++ { + dst[nDst] = b[i] + nDst++ + } + nSrc += size + } + return +} + +// ReplaceIllFormed returns a transformer that replaces all input bytes that are +// not part of a well-formed UTF-8 code sequence with utf8.RuneError. +func ReplaceIllFormed() Transformer { + return Transformer{&replaceIllFormed{}} +} + +type replaceIllFormed struct{ transform.NopResetter } + +func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) { + for n < len(src) { + // ASCII fast path. + if src[n] < utf8.RuneSelf { + n++ + continue + } + + r, size := utf8.DecodeRune(src[n:]) + + // Look for a valid non-ASCII rune. + if r != utf8.RuneError || size != 1 { + n += size + continue + } + + // Look for short source data. + if !atEOF && !utf8.FullRune(src[n:]) { + err = transform.ErrShortSrc + break + } + + // We have an invalid rune. + err = transform.ErrEndOfSpan + break + } + return n, err +} + +func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + for nSrc < len(src) { + // ASCII fast path. + if r := src[nSrc]; r < utf8.RuneSelf { + if nDst == len(dst) { + err = transform.ErrShortDst + break + } + dst[nDst] = r + nDst++ + nSrc++ + continue + } + + // Look for a valid non-ASCII rune. + if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 { + if size != copy(dst[nDst:], src[nSrc:nSrc+size]) { + err = transform.ErrShortDst + break + } + nDst += size + nSrc += size + continue + } + + // Look for short source data. + if !atEOF && !utf8.FullRune(src[nSrc:]) { + err = transform.ErrShortSrc + break + } + + // We have an invalid rune. 
+ if nDst+3 > len(dst) { + err = transform.ErrShortDst + break + } + dst[nDst+0] = runeErrorString[0] + dst[nDst+1] = runeErrorString[1] + dst[nDst+2] = runeErrorString[2] + nDst += 3 + nSrc++ + } + return nDst, nSrc, err +} diff --git a/vendor/modules.txt b/vendor/modules.txt index cd57a35508..ed5b8f393a 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -67,6 +67,9 @@ github.com/PuerkitoBio/purell # github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 ## explicit github.com/PuerkitoBio/urlesc +# github.com/STARRY-S/zip v0.2.1 +## explicit; go 1.22.2 +github.com/STARRY-S/zip # github.com/VividCortex/ewma v1.2.0 ## explicit; go 1.12 github.com/VividCortex/ewma @@ -84,9 +87,10 @@ github.com/alecthomas/participle/v2/lexer ## explicit github.com/alecthomas/template github.com/alecthomas/template/parse -# github.com/andybalholm/brotli v1.0.1 -## explicit; go 1.12 +# github.com/andybalholm/brotli v1.1.2-0.20250424173009-453214e765f3 +## explicit; go 1.13 github.com/andybalholm/brotli +github.com/andybalholm/brotli/matchfinder # github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d ## explicit; go 1.13 github.com/asaskevich/govalidator @@ -225,6 +229,28 @@ github.com/blang/semver # github.com/bmatcuk/doublestar/v4 v4.7.1 ## explicit; go 1.16 github.com/bmatcuk/doublestar/v4 +# github.com/bodgit/plumbing v1.3.0 +## explicit; go 1.13 +github.com/bodgit/plumbing +# github.com/bodgit/sevenzip v1.6.0 +## explicit; go 1.19 +github.com/bodgit/sevenzip +github.com/bodgit/sevenzip/internal/aes7z +github.com/bodgit/sevenzip/internal/bcj2 +github.com/bodgit/sevenzip/internal/bra +github.com/bodgit/sevenzip/internal/brotli +github.com/bodgit/sevenzip/internal/bzip2 +github.com/bodgit/sevenzip/internal/deflate +github.com/bodgit/sevenzip/internal/delta +github.com/bodgit/sevenzip/internal/lz4 +github.com/bodgit/sevenzip/internal/lzma +github.com/bodgit/sevenzip/internal/lzma2 +github.com/bodgit/sevenzip/internal/pool +github.com/bodgit/sevenzip/internal/util +github.com/bodgit/sevenzip/internal/zstd +# github.com/bodgit/windows v1.0.1 +## explicit; go 1.13 +github.com/bodgit/windows # github.com/brunoga/deep v1.2.4 ## explicit; go 1.20.0 github.com/brunoga/deep @@ -270,7 +296,7 @@ github.com/dave/jennifer/jen # github.com/davecgh/go-spew v1.1.1 ## explicit github.com/davecgh/go-spew/spew -# github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 +# github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 ## explicit; go 1.9 github.com/dsnet/compress github.com/dsnet/compress/bzip2 @@ -415,9 +441,6 @@ github.com/gofrs/flock # github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da ## explicit github.com/golang/groupcache/lru -# github.com/golang/snappy v0.0.4 -## explicit -github.com/golang/snappy # github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99 ## explicit; go 1.14 github.com/google/pprof/profile @@ -430,9 +453,15 @@ github.com/gorilla/websocket # github.com/gowebpki/jcs v1.0.1 ## explicit; go 1.15 github.com/gowebpki/jcs +# 
github.com/hashicorp/errwrap v1.1.0 +## explicit +github.com/hashicorp/errwrap # github.com/hashicorp/go-cleanhttp v0.5.2 ## explicit; go 1.13 github.com/hashicorp/go-cleanhttp +# github.com/hashicorp/go-multierror v1.1.1 +## explicit; go 1.13 +github.com/hashicorp/go-multierror # github.com/hashicorp/go-retryablehttp v0.7.7 ## explicit; go 1.19 github.com/hashicorp/go-retryablehttp @@ -481,17 +510,23 @@ github.com/kballard/go-shellquote # github.com/kevinburke/ssh_config v1.2.0 ## explicit github.com/kevinburke/ssh_config -# github.com/klauspost/compress v1.11.4 -## explicit; go 1.13 +# github.com/klauspost/compress v1.17.11 +## explicit; go 1.21 +github.com/klauspost/compress github.com/klauspost/compress/flate github.com/klauspost/compress/fse github.com/klauspost/compress/gzip github.com/klauspost/compress/huff0 -github.com/klauspost/compress/snappy +github.com/klauspost/compress/internal/cpuinfo +github.com/klauspost/compress/internal/godebug +github.com/klauspost/compress/internal/race +github.com/klauspost/compress/internal/snapref +github.com/klauspost/compress/s2 github.com/klauspost/compress/zip +github.com/klauspost/compress/zlib github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd/internal/xxhash -# github.com/klauspost/pgzip v1.2.5 +# github.com/klauspost/pgzip v1.2.6 ## explicit github.com/klauspost/pgzip # github.com/kr/pty v1.1.8 @@ -548,9 +583,16 @@ github.com/mattn/go-runewidth # github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d ## explicit github.com/mgutz/ansi -# github.com/mholt/archiver/v3 v3.5.1 -## explicit; go 1.13 -github.com/mholt/archiver/v3 +# github.com/mholt/archives v0.1.3 +## explicit; go 1.22.2 +github.com/mholt/archives +# github.com/mikelolasagasti/xz v1.0.1 +## explicit; go 1.15 +github.com/mikelolasagasti/xz +# github.com/minio/minlz v1.0.0 +## explicit; go 1.21 +github.com/minio/minlz +github.com/minio/minlz/internal/race # github.com/mitchellh/mapstructure v1.5.0 ## explicit; go 1.14 github.com/mitchellh/mapstructure @@ -579,9 +621,9 @@ github.com/nicksnyder/go-i18n/i18n/translation # github.com/nu7hatch/gouuid v0.0.0-20131221200532-179d4d0c4d8d ## explicit github.com/nu7hatch/gouuid -# github.com/nwaples/rardecode v1.1.3 -## explicit -github.com/nwaples/rardecode +# github.com/nwaples/rardecode/v2 v2.1.0 +## explicit; go 1.21 +github.com/nwaples/rardecode/v2 # github.com/oklog/ulid v1.3.1 ## explicit github.com/oklog/ulid @@ -599,7 +641,7 @@ github.com/pelletier/go-toml # github.com/phayes/permbits v0.0.0-20190108233746-1efae4548023 ## explicit github.com/phayes/permbits -# github.com/pierrec/lz4/v4 v4.1.2 +# github.com/pierrec/lz4/v4 v4.1.21 ## explicit; go 1.14 github.com/pierrec/lz4/v4 github.com/pierrec/lz4/v4/internal/lz4block @@ -652,6 +694,9 @@ github.com/skeema/knownhosts # github.com/skratchdot/open-golang v0.0.0-20190104022628-a2dfa6d0dab6 ## explicit github.com/skratchdot/open-golang/open +# github.com/sorairolake/lzip-go v0.3.5 +## 
explicit; go 1.22 +github.com/sorairolake/lzip-go # github.com/sosodev/duration v1.3.1 ## explicit; go 1.17 github.com/sosodev/duration @@ -710,9 +755,6 @@ github.com/vektah/gqlparser/v2/validator/rules # github.com/xanzy/ssh-agent v0.3.3 ## explicit; go 1.16 github.com/xanzy/ssh-agent -# github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 -## explicit -github.com/xi2/xz # github.com/yosida95/uritemplate/v3 v3.0.2 ## explicit; go 1.14 github.com/yosida95/uritemplate/v3 @@ -731,6 +773,10 @@ go.mongodb.org/mongo-driver/bson/bsonrw go.mongodb.org/mongo-driver/bson/bsontype go.mongodb.org/mongo-driver/bson/primitive go.mongodb.org/mongo-driver/x/bsonx/bsoncore +# go4.org v0.0.0-20230225012048-214862532bf5 +## explicit; go 1.13 +go4.org/readerutil +go4.org/syncutil # golang.org/x/crypto v0.41.0 ## explicit; go 1.23.0 golang.org/x/crypto/acme @@ -782,11 +828,17 @@ golang.org/x/term # golang.org/x/text v0.28.0 ## explicit; go 1.23.0 golang.org/x/text/cases +golang.org/x/text/encoding +golang.org/x/text/encoding/internal +golang.org/x/text/encoding/internal/identifier +golang.org/x/text/encoding/unicode golang.org/x/text/internal golang.org/x/text/internal/language golang.org/x/text/internal/language/compact golang.org/x/text/internal/tag +golang.org/x/text/internal/utf8internal golang.org/x/text/language +golang.org/x/text/runes golang.org/x/text/secure/bidirule golang.org/x/text/transform golang.org/x/text/unicode/bidi
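
For context when reviewing: below is a minimal sketch of the golang.org/x/text/runes API vendored above. The main package and the specific transforms chosen are illustrative assumptions, not code from this repository; the functions used (Remove, In, Map, ReplaceIllFormed, Transformer.String) are all defined in the vendored source in this diff.

// Illustrative only: exercises the vendored golang.org/x/text/runes package.
package main

import (
	"fmt"
	"unicode"

	"golang.org/x/text/runes"
)

func main() {
	// Remove drops every rune its Set matches; In(unicode.Mn) matches
	// nonspacing combining marks, so the combining acute accent is removed.
	strip := runes.Remove(runes.In(unicode.Mn))
	fmt.Println(strip.String("cafe\u0301")) // cafe

	// Map rewrites each rune through the supplied function; ill-formed
	// bytes reach the function as utf8.RuneError.
	ascii := runes.Map(func(r rune) rune {
		if r > unicode.MaxASCII {
			return '?'
		}
		return r
	})
	fmt.Println(ascii.String("héllo")) // h?llo

	// ReplaceIllFormed substitutes utf8.RuneError (U+FFFD) for bytes that
	// are not part of a well-formed UTF-8 sequence.
	fix := runes.ReplaceIllFormed()
	fmt.Println(fix.String("ok\xffok")) // ok\uFFFDok
}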
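Because runes.Transformer wraps a transform.SpanningTransformer, it also composes with golang.org/x/text/transform (vendored here as well). A hypothetical streaming pipeline, again an illustration rather than code from this change:

// Illustrative only: chain two rune transforms and apply them to a stream.
package main

import (
	"io"
	"os"
	"strings"
	"unicode"

	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
)

func main() {
	t := transform.Chain(
		runes.ReplaceIllFormed(),           // repair ill-formed bytes first
		runes.Remove(runes.In(unicode.Cc)), // then drop control characters
	)
	r := transform.NewReader(strings.NewReader("a\x00b\xffc\n"), t)
	// \x00 and \n are control characters and are removed; \xff becomes
	// U+FFFD, so this prints "ab\uFFFDc".
	if _, err := io.Copy(os.Stdout, r); err != nil {
		panic(err)
	}
}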