From: Adrian Schmutzler Date: Wed, 24 Feb 2021 10:53:03 +0000 (+0100) Subject: zlib: properly split patches X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=221eefaf6b301043c491aab8815fcfa24e8a5583;p=openwrt%2Fstaging%2Fdangole.git zlib: properly split patches This package had two patches (with two headers etc.) in one file, which would have quilt merging them during a refresh. Separate these patches into two files, as the original intent seems to be having them separate. Signed-off-by: Adrian Schmutzler --- diff --git a/package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch b/package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch index 475ed6f3e9..d181b034e5 100644 --- a/package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch +++ b/package/libs/zlib/patches/002-arm-specific-optimisations-for-inflate.patch @@ -1907,505 +1907,3 @@ index 00000000..ac333e8c + state = (struct inflate_state FAR *)strm->state; + return (unsigned long)(state->next - state->codes); +} - -From 247147654fe5cd11cf15d8dff91440405ea57040 Mon Sep 17 00:00:00 2001 -From: Simon Hosie -Date: Wed, 12 Apr 2017 15:44:21 -0700 -Subject: [PATCH 2/2] Inflate using wider loads and stores - -In inflate_fast() the output pointer always has plenty of room to write. This -means that so long as the target is capable, wide un-aligned loads and stores -can be used to transfer several bytes at once. When the reference distance is -too short simply unroll the data a little to increase the distance. - -Change-Id: I59854eb25d2b1e43561c8a2afaf9175bf10cf674 ---- - contrib/arm/chunkcopy.h | 279 ++++++++++++++++++++++++++++++++++++++++++++++++ - contrib/arm/inffast.c | 96 +++++++---------- - contrib/arm/inflate.c | 22 ++-- - 3 files changed, 335 insertions(+), 62 deletions(-) - create mode 100644 contrib/arm/chunkcopy.h - -diff --git a/contrib/arm/chunkcopy.h b/contrib/arm/chunkcopy.h -new file mode 100644 -index 00000000..2d6fd6f9 ---- /dev/null -+++ b/contrib/arm/chunkcopy.h -@@ -0,0 +1,279 @@ -+/* chunkcopy.h -- fast copies and sets -+ * Copyright (C) 2017 ARM, Inc. -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+#ifndef CHUNKCOPY_H -+#define CHUNKCOPY_H -+ -+#include "zutil.h" -+#include -+ -+#if __STDC_VERSION__ >= 199901L -+#define Z_RESTRICT restrict -+#else -+#define Z_RESTRICT -+#endif -+ -+typedef uint8x16_t chunkcopy_chunk_t; -+#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t) -+ -+/* -+ Ask the compiler to perform a wide, unaligned load with an machine -+ instruction appropriate for the chunkcopy_chunk_t type. -+ */ -+static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR *s) { -+ chunkcopy_chunk_t c; -+ __builtin_memcpy(&c, s, sizeof(c)); -+ return c; -+} -+ -+/* -+ Ask the compiler to perform a wide, unaligned store with an machine -+ instruction appropriate for the chunkcopy_chunk_t type. -+ */ -+static inline void storechunk(unsigned char FAR *d, chunkcopy_chunk_t c) { -+ __builtin_memcpy(d, &c, sizeof(c)); -+} -+ -+/* -+ Perform a memcpy-like operation, but assume that length is non-zero and that -+ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if -+ the length is shorter than this. -+ -+ It also guarantees that it will properly unroll the data if the distance -+ between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on -+ in chunkcopy_relaxed(). -+ -+ Aside from better memory bus utilisation, this means that short copies -+ (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop -+ without iteration, which will hopefully make the branch prediction more -+ reliable. -+ */ -+static inline unsigned char FAR *chunkcopy_core(unsigned char FAR *out, -+ const unsigned char FAR *from, -+ unsigned len) { -+ int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1; -+ storechunk(out, loadchunk(from)); -+ out += bump; -+ from += bump; -+ len /= CHUNKCOPY_CHUNK_SIZE; -+ while (len-- > 0) { -+ storechunk(out, loadchunk(from)); -+ out += CHUNKCOPY_CHUNK_SIZE; -+ from += CHUNKCOPY_CHUNK_SIZE; -+ } -+ return out; -+} -+ -+/* -+ Like chunkcopy_core, but avoid writing beyond of legal output. -+ -+ Accepts an additional pointer to the end of safe output. A generic safe -+ copy would use (out + len), but it's normally the case that the end of the -+ output buffer is beyond the end of the current copy, and this can still be -+ exploited. -+ */ -+static inline unsigned char FAR *chunkcopy_core_safe(unsigned char FAR *out, -+ const unsigned char FAR * from, -+ unsigned len, -+ unsigned char FAR *limit) { -+ Assert(out + len <= limit, "chunk copy exceeds safety limit"); -+ if (limit - out < CHUNKCOPY_CHUNK_SIZE) { -+ const unsigned char FAR * Z_RESTRICT rfrom = from; -+ if (len & 8) { __builtin_memcpy(out, rfrom, 8); out += 8; rfrom += 8; } -+ if (len & 4) { __builtin_memcpy(out, rfrom, 4); out += 4; rfrom += 4; } -+ if (len & 2) { __builtin_memcpy(out, rfrom, 2); out += 2; rfrom += 2; } -+ if (len & 1) { *out++ = *rfrom++; } -+ return out; -+ } -+ return chunkcopy_core(out, from, len); -+} -+ -+/* -+ Perform short copies until distance can be rewritten as being at least -+ CHUNKCOPY_CHUNK_SIZE. -+ -+ This assumes that it's OK to overwrite at least the first -+ 2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than -+ this. This assumption holds within inflate_fast() which starts every -+ iteration with at least 258 bytes of output space available (258 being the -+ maximum length output from a single token; see inffast.c). -+ */ -+static inline unsigned char FAR *chunkunroll_relaxed(unsigned char FAR *out, -+ unsigned FAR *dist, -+ unsigned FAR *len) { -+ const unsigned char FAR *from = out - *dist; -+ while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) { -+ storechunk(out, loadchunk(from)); -+ out += *dist; -+ *len -= *dist; -+ *dist += *dist; -+ } -+ return out; -+} -+ -+ -+static inline uint8x16_t chunkset_vld1q_dup_u8x8(const unsigned char FAR * Z_RESTRICT from) { -+#if defined(__clang__) || defined(__aarch64__) -+ return vreinterpretq_u8_u64(vld1q_dup_u64((void *)from)); -+#else -+ /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a -+ * void pointer, so here's an alternate implementation. -+ */ -+ uint8x8_t h = vld1_u8(from); -+ return vcombine_u8(h, h); -+#endif -+} -+ -+/* -+ Perform an overlapping copy which behaves as a memset() operation, but -+ supporting periods other than one, and assume that length is non-zero and -+ that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output -+ even if the length is shorter than this. -+ */ -+static inline unsigned char FAR *chunkset_core(unsigned char FAR *out, -+ unsigned period, -+ unsigned len) { -+ uint8x16_t f; -+ int bump = ((len - 1) % sizeof(f)) + 1; -+ -+ switch (period) { -+ case 1: -+ f = vld1q_dup_u8(out - 1); -+ vst1q_u8(out, f); -+ out += bump; -+ len -= bump; -+ while (len > 0) { -+ vst1q_u8(out, f); -+ out += sizeof(f); -+ len -= sizeof(f); -+ } -+ return out; -+ case 2: -+ f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2))); -+ vst1q_u8(out, f); -+ out += bump; -+ len -= bump; -+ if (len > 0) { -+ f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2))); -+ do { -+ vst1q_u8(out, f); -+ out += sizeof(f); -+ len -= sizeof(f); -+ } while (len > 0); -+ } -+ return out; -+ case 4: -+ f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4))); -+ vst1q_u8(out, f); -+ out += bump; -+ len -= bump; -+ if (len > 0) { -+ f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4))); -+ do { -+ vst1q_u8(out, f); -+ out += sizeof(f); -+ len -= sizeof(f); -+ } while (len > 0); -+ } -+ return out; -+ case 8: -+ f = chunkset_vld1q_dup_u8x8(out - 8); -+ vst1q_u8(out, f); -+ out += bump; -+ len -= bump; -+ if (len > 0) { -+ f = chunkset_vld1q_dup_u8x8(out - 8); -+ do { -+ vst1q_u8(out, f); -+ out += sizeof(f); -+ len -= sizeof(f); -+ } while (len > 0); -+ } -+ return out; -+ } -+ out = chunkunroll_relaxed(out, &period, &len); -+ return chunkcopy_core(out, out - period, len); -+} -+ -+/* -+ Perform a memcpy-like operation, but assume that length is non-zero and that -+ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if -+ the length is shorter than this. -+ -+ Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour -+ of overlapping buffers, regardless of the distance between the pointers. -+ This is reflected in the `restrict`-qualified pointers, allowing the -+ compiler to reorder loads and stores. -+ */ -+static inline unsigned char FAR *chunkcopy_relaxed(unsigned char FAR * Z_RESTRICT out, -+ const unsigned char FAR * Z_RESTRICT from, -+ unsigned len) { -+ return chunkcopy_core(out, from, len); -+} -+ -+/* -+ Like chunkcopy_relaxed, but avoid writing beyond of legal output. -+ -+ Unlike chunkcopy_core_safe() above, no guarantee is made regarding the -+ behaviour of overlapping buffers, regardless of the distance between the -+ pointers. This is reflected in the `restrict`-qualified pointers, allowing -+ the compiler to reorder loads and stores. -+ -+ Accepts an additional pointer to the end of safe output. A generic safe -+ copy would use (out + len), but it's normally the case that the end of the -+ output buffer is beyond the end of the current copy, and this can still be -+ exploited. -+ */ -+static inline unsigned char FAR *chunkcopy_safe(unsigned char FAR *out, -+ const unsigned char FAR * Z_RESTRICT from, -+ unsigned len, -+ unsigned char FAR *limit) { -+ Assert(out + len <= limit, "chunk copy exceeds safety limit"); -+ return chunkcopy_core_safe(out, from, len, limit); -+} -+ -+/* -+ Perform chunky copy within the same buffer, where the source and destination -+ may potentially overlap. -+ -+ Assumes that len > 0 on entry, and that it's safe to write at least -+ CHUNKCOPY_CHUNK_SIZE*3 bytes to the output. -+ */ -+static inline unsigned char FAR *chunkcopy_lapped_relaxed(unsigned char FAR *out, -+ unsigned dist, -+ unsigned len) { -+ if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) { -+ return chunkset_core(out, dist, len); -+ } -+ return chunkcopy_core(out, out - dist, len); -+} -+ -+/* -+ Behave like chunkcopy_lapped_relaxed, but avoid writing beyond of legal output. -+ -+ Accepts an additional pointer to the end of safe output. A generic safe -+ copy would use (out + len), but it's normally the case that the end of the -+ output buffer is beyond the end of the current copy, and this can still be -+ exploited. -+ */ -+static inline unsigned char FAR *chunkcopy_lapped_safe(unsigned char FAR *out, -+ unsigned dist, -+ unsigned len, -+ unsigned char FAR *limit) { -+ Assert(out + len <= limit, "chunk copy exceeds safety limit"); -+ if (limit - out < CHUNKCOPY_CHUNK_SIZE * 3) { -+ /* TODO: try harder to optimise this */ -+ while (len-- > 0) { -+ *out = *(out - dist); -+ out++; -+ } -+ return out; -+ } -+ return chunkcopy_lapped_relaxed(out, dist, len); -+} -+ -+#undef Z_RESTRICT -+ -+#endif /* CHUNKCOPY_H */ -diff --git a/contrib/arm/inffast.c b/contrib/arm/inffast.c -index 0dbd1dbc..f7f50071 100644 ---- a/contrib/arm/inffast.c -+++ b/contrib/arm/inffast.c -@@ -7,6 +7,7 @@ - #include "inftrees.h" - #include "inflate.h" - #include "inffast.h" -+#include "chunkcopy.h" - - #ifdef ASMINF - # pragma message("Assembler code may have bugs -- use at your own risk") -@@ -57,6 +58,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ - unsigned char FAR *out; /* local strm->next_out */ - unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ - unsigned char FAR *end; /* while out < end, enough space available */ -+ unsigned char FAR *limit; /* safety limit for chunky copies */ - #ifdef INFLATE_STRICT - unsigned dmax; /* maximum distance from zlib header */ - #endif -@@ -84,12 +86,13 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ - out = strm->next_out; - beg = out - (start - strm->avail_out); - end = out + (strm->avail_out - 257); -+ limit = out + strm->avail_out; - #ifdef INFLATE_STRICT - dmax = state->dmax; - #endif - wsize = state->wsize; - whave = state->whave; -- wnext = state->wnext; -+ wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext; - window = state->window; - hold = state->hold; - bits = state->bits; -@@ -197,70 +200,51 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ - #endif - } - from = window; -- if (wnext == 0) { /* very common case */ -- from += wsize - op; -- if (op < len) { /* some from window */ -- len -= op; -- do { -- *out++ = *from++; -- } while (--op); -- from = out - dist; /* rest from output */ -- } -+ if (wnext >= op) { /* contiguous in window */ -+ from += wnext - op; - } -- else if (wnext < op) { /* wrap around window */ -- from += wsize + wnext - op; -+ else { /* wrap around window */ - op -= wnext; -+ from += wsize - op; - if (op < len) { /* some from end of window */ - len -= op; -- do { -- *out++ = *from++; -- } while (--op); -- from = window; -- if (wnext < len) { /* some from start of window */ -- op = wnext; -- len -= op; -- do { -- *out++ = *from++; -- } while (--op); -- from = out - dist; /* rest from output */ -- } -+ out = chunkcopy_safe(out, from, op, limit); -+ from = window; /* more from start of window */ -+ op = wnext; -+ /* This (rare) case can create a situation where -+ the first chunkcopy below must be checked. -+ */ - } - } -- else { /* contiguous in window */ -- from += wnext - op; -- if (op < len) { /* some from window */ -- len -= op; -- do { -- *out++ = *from++; -- } while (--op); -- from = out - dist; /* rest from output */ -- } -- } -- while (len > 2) { -- *out++ = *from++; -- *out++ = *from++; -- *out++ = *from++; -- len -= 3; -- } -- if (len) { -- *out++ = *from++; -- if (len > 1) -- *out++ = *from++; -+ if (op < len) { /* still need some from output */ -+ out = chunkcopy_safe(out, from, op, limit); -+ len -= op; -+ /* When dist is small the amount of data that can be -+ copied from the window is also small, and progress -+ towards the dangerous end of the output buffer is -+ also small. This means that for trivial memsets and -+ for chunkunroll_relaxed() a safety check is -+ unnecessary. However, these conditions may not be -+ entered at all, and in that case it's possible that -+ the main copy is near the end. -+ */ -+ out = chunkunroll_relaxed(out, &dist, &len); -+ out = chunkcopy_safe(out, out - dist, len, limit); -+ } else { -+ /* from points to window, so there is no risk of -+ overlapping pointers requiring memset-like behaviour -+ */ -+ out = chunkcopy_safe(out, from, len, limit); - } - } - else { -- from = out - dist; /* copy direct from output */ -- do { /* minimum length is three */ -- *out++ = *from++; -- *out++ = *from++; -- *out++ = *from++; -- len -= 3; -- } while (len > 2); -- if (len) { -- *out++ = *from++; -- if (len > 1) -- *out++ = *from++; -- } -+ /* Whole reference is in range of current output. No -+ range checks are necessary because we start with room -+ for at least 258 bytes of output, so unroll and roundoff -+ operations can write beyond `out+len` so long as they -+ stay within 258 bytes of `out`. -+ */ -+ out = chunkcopy_lapped_relaxed(out, dist, len); - } - } - else if ((op & 64) == 0) { /* 2nd level distance code */ -diff --git a/contrib/arm/inflate.c b/contrib/arm/inflate.c -index ac333e8c..e40322c3 100644 ---- a/contrib/arm/inflate.c -+++ b/contrib/arm/inflate.c -@@ -84,6 +84,7 @@ - #include "inftrees.h" - #include "inflate.h" - #include "inffast.h" -+#include "contrib/arm/chunkcopy.h" - - #ifdef MAKEFIXED - # ifndef BUILDFIXED -@@ -405,10 +406,20 @@ unsigned copy; - - /* if it hasn't been done already, allocate space for the window */ - if (state->window == Z_NULL) { -+ unsigned wsize = 1U << state->wbits; - state->window = (unsigned char FAR *) -- ZALLOC(strm, 1U << state->wbits, -+ ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE, - sizeof(unsigned char)); - if (state->window == Z_NULL) return 1; -+#ifdef INFLATE_CLEAR_UNUSED_UNDEFINED -+ /* Copies from the overflow portion of this buffer are undefined and -+ may cause analysis tools to raise a warning if we don't initialize -+ it. However, this undefined data overwrites other undefined data -+ and is subsequently either overwritten or left deliberately -+ undefined at the end of decode; so there's really no point. -+ */ -+ memset(state->window + wsize, 0, CHUNKCOPY_CHUNK_SIZE); -+#endif - } - - /* if window not in use yet, initialize */ -@@ -1175,17 +1186,16 @@ int flush; - else - from = state->window + (state->wnext - copy); - if (copy > state->length) copy = state->length; -+ if (copy > left) copy = left; -+ put = chunkcopy_safe(put, from, copy, put + left); - } - else { /* copy from output */ -- from = put - state->offset; - copy = state->length; -+ if (copy > left) copy = left; -+ put = chunkcopy_lapped_safe(put, state->offset, copy, put + left); - } -- if (copy > left) copy = left; - left -= copy; - state->length -= copy; -- do { -- *put++ = *from++; -- } while (--copy); - if (state->length == 0) state->mode = LEN; - break; - case LIT: diff --git a/package/libs/zlib/patches/003-arm-specific-optimisations-for-inflate.patch b/package/libs/zlib/patches/003-arm-specific-optimisations-for-inflate.patch new file mode 100644 index 0000000000..9370264c40 --- /dev/null +++ b/package/libs/zlib/patches/003-arm-specific-optimisations-for-inflate.patch @@ -0,0 +1,501 @@ +From 247147654fe5cd11cf15d8dff91440405ea57040 Mon Sep 17 00:00:00 2001 +From: Simon Hosie +Date: Wed, 12 Apr 2017 15:44:21 -0700 +Subject: [PATCH 2/2] Inflate using wider loads and stores + +In inflate_fast() the output pointer always has plenty of room to write. This +means that so long as the target is capable, wide un-aligned loads and stores +can be used to transfer several bytes at once. When the reference distance is +too short simply unroll the data a little to increase the distance. + +Change-Id: I59854eb25d2b1e43561c8a2afaf9175bf10cf674 +--- + contrib/arm/chunkcopy.h | 279 ++++++++++++++++++++++++++++++++++++++++++++++++ + contrib/arm/inffast.c | 96 +++++++---------- + contrib/arm/inflate.c | 22 ++-- + 3 files changed, 335 insertions(+), 62 deletions(-) + create mode 100644 contrib/arm/chunkcopy.h + +diff --git a/contrib/arm/chunkcopy.h b/contrib/arm/chunkcopy.h +new file mode 100644 +index 00000000..2d6fd6f9 +--- /dev/null ++++ b/contrib/arm/chunkcopy.h +@@ -0,0 +1,279 @@ ++/* chunkcopy.h -- fast copies and sets ++ * Copyright (C) 2017 ARM, Inc. ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#ifndef CHUNKCOPY_H ++#define CHUNKCOPY_H ++ ++#include "zutil.h" ++#include ++ ++#if __STDC_VERSION__ >= 199901L ++#define Z_RESTRICT restrict ++#else ++#define Z_RESTRICT ++#endif ++ ++typedef uint8x16_t chunkcopy_chunk_t; ++#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t) ++ ++/* ++ Ask the compiler to perform a wide, unaligned load with an machine ++ instruction appropriate for the chunkcopy_chunk_t type. ++ */ ++static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR *s) { ++ chunkcopy_chunk_t c; ++ __builtin_memcpy(&c, s, sizeof(c)); ++ return c; ++} ++ ++/* ++ Ask the compiler to perform a wide, unaligned store with an machine ++ instruction appropriate for the chunkcopy_chunk_t type. ++ */ ++static inline void storechunk(unsigned char FAR *d, chunkcopy_chunk_t c) { ++ __builtin_memcpy(d, &c, sizeof(c)); ++} ++ ++/* ++ Perform a memcpy-like operation, but assume that length is non-zero and that ++ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if ++ the length is shorter than this. ++ ++ It also guarantees that it will properly unroll the data if the distance ++ between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on ++ in chunkcopy_relaxed(). ++ ++ Aside from better memory bus utilisation, this means that short copies ++ (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop ++ without iteration, which will hopefully make the branch prediction more ++ reliable. ++ */ ++static inline unsigned char FAR *chunkcopy_core(unsigned char FAR *out, ++ const unsigned char FAR *from, ++ unsigned len) { ++ int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1; ++ storechunk(out, loadchunk(from)); ++ out += bump; ++ from += bump; ++ len /= CHUNKCOPY_CHUNK_SIZE; ++ while (len-- > 0) { ++ storechunk(out, loadchunk(from)); ++ out += CHUNKCOPY_CHUNK_SIZE; ++ from += CHUNKCOPY_CHUNK_SIZE; ++ } ++ return out; ++} ++ ++/* ++ Like chunkcopy_core, but avoid writing beyond of legal output. ++ ++ Accepts an additional pointer to the end of safe output. A generic safe ++ copy would use (out + len), but it's normally the case that the end of the ++ output buffer is beyond the end of the current copy, and this can still be ++ exploited. ++ */ ++static inline unsigned char FAR *chunkcopy_core_safe(unsigned char FAR *out, ++ const unsigned char FAR * from, ++ unsigned len, ++ unsigned char FAR *limit) { ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ if (limit - out < CHUNKCOPY_CHUNK_SIZE) { ++ const unsigned char FAR * Z_RESTRICT rfrom = from; ++ if (len & 8) { __builtin_memcpy(out, rfrom, 8); out += 8; rfrom += 8; } ++ if (len & 4) { __builtin_memcpy(out, rfrom, 4); out += 4; rfrom += 4; } ++ if (len & 2) { __builtin_memcpy(out, rfrom, 2); out += 2; rfrom += 2; } ++ if (len & 1) { *out++ = *rfrom++; } ++ return out; ++ } ++ return chunkcopy_core(out, from, len); ++} ++ ++/* ++ Perform short copies until distance can be rewritten as being at least ++ CHUNKCOPY_CHUNK_SIZE. ++ ++ This assumes that it's OK to overwrite at least the first ++ 2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than ++ this. This assumption holds within inflate_fast() which starts every ++ iteration with at least 258 bytes of output space available (258 being the ++ maximum length output from a single token; see inffast.c). ++ */ ++static inline unsigned char FAR *chunkunroll_relaxed(unsigned char FAR *out, ++ unsigned FAR *dist, ++ unsigned FAR *len) { ++ const unsigned char FAR *from = out - *dist; ++ while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) { ++ storechunk(out, loadchunk(from)); ++ out += *dist; ++ *len -= *dist; ++ *dist += *dist; ++ } ++ return out; ++} ++ ++ ++static inline uint8x16_t chunkset_vld1q_dup_u8x8(const unsigned char FAR * Z_RESTRICT from) { ++#if defined(__clang__) || defined(__aarch64__) ++ return vreinterpretq_u8_u64(vld1q_dup_u64((void *)from)); ++#else ++ /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a ++ * void pointer, so here's an alternate implementation. ++ */ ++ uint8x8_t h = vld1_u8(from); ++ return vcombine_u8(h, h); ++#endif ++} ++ ++/* ++ Perform an overlapping copy which behaves as a memset() operation, but ++ supporting periods other than one, and assume that length is non-zero and ++ that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output ++ even if the length is shorter than this. ++ */ ++static inline unsigned char FAR *chunkset_core(unsigned char FAR *out, ++ unsigned period, ++ unsigned len) { ++ uint8x16_t f; ++ int bump = ((len - 1) % sizeof(f)) + 1; ++ ++ switch (period) { ++ case 1: ++ f = vld1q_dup_u8(out - 1); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ while (len > 0) { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } ++ return out; ++ case 2: ++ f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2))); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2))); ++ do { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } while (len > 0); ++ } ++ return out; ++ case 4: ++ f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4))); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4))); ++ do { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } while (len > 0); ++ } ++ return out; ++ case 8: ++ f = chunkset_vld1q_dup_u8x8(out - 8); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ f = chunkset_vld1q_dup_u8x8(out - 8); ++ do { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } while (len > 0); ++ } ++ return out; ++ } ++ out = chunkunroll_relaxed(out, &period, &len); ++ return chunkcopy_core(out, out - period, len); ++} ++ ++/* ++ Perform a memcpy-like operation, but assume that length is non-zero and that ++ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if ++ the length is shorter than this. ++ ++ Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour ++ of overlapping buffers, regardless of the distance between the pointers. ++ This is reflected in the `restrict`-qualified pointers, allowing the ++ compiler to reorder loads and stores. ++ */ ++static inline unsigned char FAR *chunkcopy_relaxed(unsigned char FAR * Z_RESTRICT out, ++ const unsigned char FAR * Z_RESTRICT from, ++ unsigned len) { ++ return chunkcopy_core(out, from, len); ++} ++ ++/* ++ Like chunkcopy_relaxed, but avoid writing beyond of legal output. ++ ++ Unlike chunkcopy_core_safe() above, no guarantee is made regarding the ++ behaviour of overlapping buffers, regardless of the distance between the ++ pointers. This is reflected in the `restrict`-qualified pointers, allowing ++ the compiler to reorder loads and stores. ++ ++ Accepts an additional pointer to the end of safe output. A generic safe ++ copy would use (out + len), but it's normally the case that the end of the ++ output buffer is beyond the end of the current copy, and this can still be ++ exploited. ++ */ ++static inline unsigned char FAR *chunkcopy_safe(unsigned char FAR *out, ++ const unsigned char FAR * Z_RESTRICT from, ++ unsigned len, ++ unsigned char FAR *limit) { ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ return chunkcopy_core_safe(out, from, len, limit); ++} ++ ++/* ++ Perform chunky copy within the same buffer, where the source and destination ++ may potentially overlap. ++ ++ Assumes that len > 0 on entry, and that it's safe to write at least ++ CHUNKCOPY_CHUNK_SIZE*3 bytes to the output. ++ */ ++static inline unsigned char FAR *chunkcopy_lapped_relaxed(unsigned char FAR *out, ++ unsigned dist, ++ unsigned len) { ++ if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) { ++ return chunkset_core(out, dist, len); ++ } ++ return chunkcopy_core(out, out - dist, len); ++} ++ ++/* ++ Behave like chunkcopy_lapped_relaxed, but avoid writing beyond of legal output. ++ ++ Accepts an additional pointer to the end of safe output. A generic safe ++ copy would use (out + len), but it's normally the case that the end of the ++ output buffer is beyond the end of the current copy, and this can still be ++ exploited. ++ */ ++static inline unsigned char FAR *chunkcopy_lapped_safe(unsigned char FAR *out, ++ unsigned dist, ++ unsigned len, ++ unsigned char FAR *limit) { ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ if (limit - out < CHUNKCOPY_CHUNK_SIZE * 3) { ++ /* TODO: try harder to optimise this */ ++ while (len-- > 0) { ++ *out = *(out - dist); ++ out++; ++ } ++ return out; ++ } ++ return chunkcopy_lapped_relaxed(out, dist, len); ++} ++ ++#undef Z_RESTRICT ++ ++#endif /* CHUNKCOPY_H */ +diff --git a/contrib/arm/inffast.c b/contrib/arm/inffast.c +index 0dbd1dbc..f7f50071 100644 +--- a/contrib/arm/inffast.c ++++ b/contrib/arm/inffast.c +@@ -7,6 +7,7 @@ + #include "inftrees.h" + #include "inflate.h" + #include "inffast.h" ++#include "chunkcopy.h" + + #ifdef ASMINF + # pragma message("Assembler code may have bugs -- use at your own risk") +@@ -57,6 +58,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ + unsigned char FAR *out; /* local strm->next_out */ + unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ + unsigned char FAR *end; /* while out < end, enough space available */ ++ unsigned char FAR *limit; /* safety limit for chunky copies */ + #ifdef INFLATE_STRICT + unsigned dmax; /* maximum distance from zlib header */ + #endif +@@ -84,12 +86,13 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ + out = strm->next_out; + beg = out - (start - strm->avail_out); + end = out + (strm->avail_out - 257); ++ limit = out + strm->avail_out; + #ifdef INFLATE_STRICT + dmax = state->dmax; + #endif + wsize = state->wsize; + whave = state->whave; +- wnext = state->wnext; ++ wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext; + window = state->window; + hold = state->hold; + bits = state->bits; +@@ -197,70 +200,51 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */ + #endif + } + from = window; +- if (wnext == 0) { /* very common case */ +- from += wsize - op; +- if (op < len) { /* some from window */ +- len -= op; +- do { +- *out++ = *from++; +- } while (--op); +- from = out - dist; /* rest from output */ +- } ++ if (wnext >= op) { /* contiguous in window */ ++ from += wnext - op; + } +- else if (wnext < op) { /* wrap around window */ +- from += wsize + wnext - op; ++ else { /* wrap around window */ + op -= wnext; ++ from += wsize - op; + if (op < len) { /* some from end of window */ + len -= op; +- do { +- *out++ = *from++; +- } while (--op); +- from = window; +- if (wnext < len) { /* some from start of window */ +- op = wnext; +- len -= op; +- do { +- *out++ = *from++; +- } while (--op); +- from = out - dist; /* rest from output */ +- } ++ out = chunkcopy_safe(out, from, op, limit); ++ from = window; /* more from start of window */ ++ op = wnext; ++ /* This (rare) case can create a situation where ++ the first chunkcopy below must be checked. ++ */ + } + } +- else { /* contiguous in window */ +- from += wnext - op; +- if (op < len) { /* some from window */ +- len -= op; +- do { +- *out++ = *from++; +- } while (--op); +- from = out - dist; /* rest from output */ +- } +- } +- while (len > 2) { +- *out++ = *from++; +- *out++ = *from++; +- *out++ = *from++; +- len -= 3; +- } +- if (len) { +- *out++ = *from++; +- if (len > 1) +- *out++ = *from++; ++ if (op < len) { /* still need some from output */ ++ out = chunkcopy_safe(out, from, op, limit); ++ len -= op; ++ /* When dist is small the amount of data that can be ++ copied from the window is also small, and progress ++ towards the dangerous end of the output buffer is ++ also small. This means that for trivial memsets and ++ for chunkunroll_relaxed() a safety check is ++ unnecessary. However, these conditions may not be ++ entered at all, and in that case it's possible that ++ the main copy is near the end. ++ */ ++ out = chunkunroll_relaxed(out, &dist, &len); ++ out = chunkcopy_safe(out, out - dist, len, limit); ++ } else { ++ /* from points to window, so there is no risk of ++ overlapping pointers requiring memset-like behaviour ++ */ ++ out = chunkcopy_safe(out, from, len, limit); + } + } + else { +- from = out - dist; /* copy direct from output */ +- do { /* minimum length is three */ +- *out++ = *from++; +- *out++ = *from++; +- *out++ = *from++; +- len -= 3; +- } while (len > 2); +- if (len) { +- *out++ = *from++; +- if (len > 1) +- *out++ = *from++; +- } ++ /* Whole reference is in range of current output. No ++ range checks are necessary because we start with room ++ for at least 258 bytes of output, so unroll and roundoff ++ operations can write beyond `out+len` so long as they ++ stay within 258 bytes of `out`. ++ */ ++ out = chunkcopy_lapped_relaxed(out, dist, len); + } + } + else if ((op & 64) == 0) { /* 2nd level distance code */ +diff --git a/contrib/arm/inflate.c b/contrib/arm/inflate.c +index ac333e8c..e40322c3 100644 +--- a/contrib/arm/inflate.c ++++ b/contrib/arm/inflate.c +@@ -84,6 +84,7 @@ + #include "inftrees.h" + #include "inflate.h" + #include "inffast.h" ++#include "contrib/arm/chunkcopy.h" + + #ifdef MAKEFIXED + # ifndef BUILDFIXED +@@ -405,10 +406,20 @@ unsigned copy; + + /* if it hasn't been done already, allocate space for the window */ + if (state->window == Z_NULL) { ++ unsigned wsize = 1U << state->wbits; + state->window = (unsigned char FAR *) +- ZALLOC(strm, 1U << state->wbits, ++ ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE, + sizeof(unsigned char)); + if (state->window == Z_NULL) return 1; ++#ifdef INFLATE_CLEAR_UNUSED_UNDEFINED ++ /* Copies from the overflow portion of this buffer are undefined and ++ may cause analysis tools to raise a warning if we don't initialize ++ it. However, this undefined data overwrites other undefined data ++ and is subsequently either overwritten or left deliberately ++ undefined at the end of decode; so there's really no point. ++ */ ++ memset(state->window + wsize, 0, CHUNKCOPY_CHUNK_SIZE); ++#endif + } + + /* if window not in use yet, initialize */ +@@ -1175,17 +1186,16 @@ int flush; + else + from = state->window + (state->wnext - copy); + if (copy > state->length) copy = state->length; ++ if (copy > left) copy = left; ++ put = chunkcopy_safe(put, from, copy, put + left); + } + else { /* copy from output */ +- from = put - state->offset; + copy = state->length; ++ if (copy > left) copy = left; ++ put = chunkcopy_lapped_safe(put, state->offset, copy, put + left); + } +- if (copy > left) copy = left; + left -= copy; + state->length -= copy; +- do { +- *put++ = *from++; +- } while (--copy); + if (state->length == 0) state->mode = LEN; + break; + case LIT: diff --git a/package/libs/zlib/patches/003-attach-sourcefiles-in-patch-002-to-buildsystem.patch b/package/libs/zlib/patches/003-attach-sourcefiles-in-patch-002-to-buildsystem.patch deleted file mode 100644 index 68f317b24b..0000000000 --- a/package/libs/zlib/patches/003-attach-sourcefiles-in-patch-002-to-buildsystem.patch +++ /dev/null @@ -1,100 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 8e75f66..24d7329 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -95,34 +95,67 @@ set(ZLIB_PUBLIC_HDRS - ${CMAKE_CURRENT_BINARY_DIR}/zconf.h - zlib.h - ) --set(ZLIB_PRIVATE_HDRS -- crc32.h -- deflate.h -- gzguts.h -- inffast.h -- inffixed.h -- inflate.h -- inftrees.h -- trees.h -- zutil.h --) --set(ZLIB_SRCS -- adler32.c -- compress.c -- crc32.c -- deflate.c -- gzclose.c -- gzlib.c -- gzread.c -- gzwrite.c -- inflate.c -- infback.c -- inftrees.c -- inffast.c -- trees.c -- uncompr.c -- zutil.c --) -+ -+if(ARMv8) -+ set(ZLIB_PRIVATE_HDRS -+ crc32.h -+ deflate.h -+ gzguts.h -+ inffast.h -+ inffixed.h -+ inflate.h -+ inftrees.h -+ trees.h -+ zutil.h -+ contrib/arm/chunkcopy.h -+ ) -+ set(ZLIB_SRCS -+ adler32.c -+ compress.c -+ crc32.c -+ deflate.c -+ gzclose.c -+ gzlib.c -+ gzread.c -+ gzwrite.c -+ infback.c -+ inftrees.c -+ contrib/arm/inflate.c -+ contrib/arm/inffast.c -+ trees.c -+ uncompr.c -+ zutil.c -+ ) -+ else() -+ set(ZLIB_PRIVATE_HDRS -+ crc32.h -+ deflate.h -+ gzguts.h -+ inffast.h -+ inffixed.h -+ inflate.h -+ inftrees.h -+ trees.h -+ zutil.h -+ ) -+ set(ZLIB_SRCS -+ adler32.c -+ compress.c -+ crc32.c -+ deflate.c -+ gzclose.c -+ gzlib.c -+ gzread.c -+ gzwrite.c -+ inflate.c -+ infback.c -+ inftrees.c -+ inffast.c -+ trees.c -+ uncompr.c -+ zutil.c -+ ) -+endif() - - if(NOT MINGW) - set(ZLIB_DLL_SRCS diff --git a/package/libs/zlib/patches/004-attach-sourcefiles-in-patch-002-to-buildsystem.patch b/package/libs/zlib/patches/004-attach-sourcefiles-in-patch-002-to-buildsystem.patch new file mode 100644 index 0000000000..68f317b24b --- /dev/null +++ b/package/libs/zlib/patches/004-attach-sourcefiles-in-patch-002-to-buildsystem.patch @@ -0,0 +1,100 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 8e75f66..24d7329 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -95,34 +95,67 @@ set(ZLIB_PUBLIC_HDRS + ${CMAKE_CURRENT_BINARY_DIR}/zconf.h + zlib.h + ) +-set(ZLIB_PRIVATE_HDRS +- crc32.h +- deflate.h +- gzguts.h +- inffast.h +- inffixed.h +- inflate.h +- inftrees.h +- trees.h +- zutil.h +-) +-set(ZLIB_SRCS +- adler32.c +- compress.c +- crc32.c +- deflate.c +- gzclose.c +- gzlib.c +- gzread.c +- gzwrite.c +- inflate.c +- infback.c +- inftrees.c +- inffast.c +- trees.c +- uncompr.c +- zutil.c +-) ++ ++if(ARMv8) ++ set(ZLIB_PRIVATE_HDRS ++ crc32.h ++ deflate.h ++ gzguts.h ++ inffast.h ++ inffixed.h ++ inflate.h ++ inftrees.h ++ trees.h ++ zutil.h ++ contrib/arm/chunkcopy.h ++ ) ++ set(ZLIB_SRCS ++ adler32.c ++ compress.c ++ crc32.c ++ deflate.c ++ gzclose.c ++ gzlib.c ++ gzread.c ++ gzwrite.c ++ infback.c ++ inftrees.c ++ contrib/arm/inflate.c ++ contrib/arm/inffast.c ++ trees.c ++ uncompr.c ++ zutil.c ++ ) ++ else() ++ set(ZLIB_PRIVATE_HDRS ++ crc32.h ++ deflate.h ++ gzguts.h ++ inffast.h ++ inffixed.h ++ inflate.h ++ inftrees.h ++ trees.h ++ zutil.h ++ ) ++ set(ZLIB_SRCS ++ adler32.c ++ compress.c ++ crc32.c ++ deflate.c ++ gzclose.c ++ gzlib.c ++ gzread.c ++ gzwrite.c ++ inflate.c ++ infback.c ++ inftrees.c ++ inffast.c ++ trees.c ++ uncompr.c ++ zutil.c ++ ) ++endif() + + if(NOT MINGW) + set(ZLIB_DLL_SRCS diff --git a/package/libs/zlib/patches/004-relative-pkg-config-paths.patch b/package/libs/zlib/patches/004-relative-pkg-config-paths.patch deleted file mode 100644 index c86d19b412..0000000000 --- a/package/libs/zlib/patches/004-relative-pkg-config-paths.patch +++ /dev/null @@ -1,14 +0,0 @@ ---- a/zlib.pc.cmakein -+++ b/zlib.pc.cmakein -@@ -1,8 +1,8 @@ - prefix=@CMAKE_INSTALL_PREFIX@ - exec_prefix=@CMAKE_INSTALL_PREFIX@ --libdir=@INSTALL_LIB_DIR@ --sharedlibdir=@INSTALL_LIB_DIR@ --includedir=@INSTALL_INC_DIR@ -+libdir=${exec_prefix}/lib -+sharedlibdir=${exec_prefix}/lib -+includedir=${prefix}/include - - Name: zlib - Description: zlib compression library diff --git a/package/libs/zlib/patches/005-relative-pkg-config-paths.patch b/package/libs/zlib/patches/005-relative-pkg-config-paths.patch new file mode 100644 index 0000000000..c86d19b412 --- /dev/null +++ b/package/libs/zlib/patches/005-relative-pkg-config-paths.patch @@ -0,0 +1,14 @@ +--- a/zlib.pc.cmakein ++++ b/zlib.pc.cmakein +@@ -1,8 +1,8 @@ + prefix=@CMAKE_INSTALL_PREFIX@ + exec_prefix=@CMAKE_INSTALL_PREFIX@ +-libdir=@INSTALL_LIB_DIR@ +-sharedlibdir=@INSTALL_LIB_DIR@ +-includedir=@INSTALL_INC_DIR@ ++libdir=${exec_prefix}/lib ++sharedlibdir=${exec_prefix}/lib ++includedir=${prefix}/include + + Name: zlib + Description: zlib compression library