/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Author: Francesc Alted <francesc@blosc.org>
  Creation date: 2018-01-03

  See LICENSES/BLOSC.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  The code in this file is heavily based on memcopy.h, from the
  zlib-ng compression library.  See LICENSES/ZLIB.txt for details.
  See also: https://github.com/Dead2/zlib-ng/blob/develop/zlib.h

  New implementations by Francesc Alted:

  * get_run() and get_match() family of functions
  * fast_copy() and safe_copy() functions
  * Support for SSE2/AVX2 copy instructions for these routines
**********************************************************************/
#include <assert.h>
#include <string.h>

#include "blosc-common.h"
/* Copy a single byte from FROM to OUT; return OUT advanced by 1. */
static inline unsigned char *copy_1_bytes(unsigned char *out, const unsigned char *from) {
  out[0] = from[0];
  return out + 1;
}
/* Copy 2 bytes from FROM to OUT; return OUT advanced by 2.
 *
 * A fixed-size memcpy replaces the old `*(uint16_t *)out = *(uint16_t *)from`
 * fast path: that cast violates strict aliasing and may fault on unaligned
 * pointers, while memcpy with a constant size compiles to the same single
 * 2-byte load/store on all mainstream compilers.  This also makes the
 * BLOSC_STRICT_ALIGN branch (which did exactly this) redundant. */
static inline unsigned char *copy_2_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 2);
  return out + 2;
}
/* Copy 3 bytes from FROM to OUT; return OUT advanced by 3.
 *
 * A single fixed-size memcpy replaces the old 1-byte + 2-byte composition,
 * which routed through helpers whose fast paths performed aliasing-unsafe
 * casts.  The constant-size memcpy is fully inlined by the compiler. */
static inline unsigned char *copy_3_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 3);
  return out + 3;
}
/* Copy 4 bytes from FROM to OUT; return OUT advanced by 4.
 *
 * A fixed-size memcpy replaces the old `*(uint32_t *)out = *(uint32_t *)from`
 * fast path: that cast violates strict aliasing and may fault on unaligned
 * pointers, while memcpy with a constant size compiles to the same single
 * 4-byte load/store.  The BLOSC_STRICT_ALIGN branch did exactly this. */
static inline unsigned char *copy_4_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 4);
  return out + 4;
}
/* Copy 5 bytes from FROM to OUT; return OUT advanced by 5.
 *
 * A single fixed-size memcpy replaces the old 1-byte + 4-byte composition,
 * which routed through helpers whose fast paths performed aliasing-unsafe
 * casts.  The constant-size memcpy is fully inlined by the compiler. */
static inline unsigned char *copy_5_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 5);
  return out + 5;
}
/* Copy 6 bytes from FROM to OUT; return OUT advanced by 6.
 *
 * A single fixed-size memcpy replaces the old 2-byte + 4-byte composition,
 * which routed through helpers whose fast paths performed aliasing-unsafe
 * casts.  The constant-size memcpy is fully inlined by the compiler. */
static inline unsigned char *copy_6_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 6);
  return out + 6;
}
/* Copy 7 bytes from FROM to OUT; return OUT advanced by 7.
 *
 * A single fixed-size memcpy replaces the old 3-byte + 4-byte composition,
 * which routed through helpers whose fast paths performed aliasing-unsafe
 * casts.  The constant-size memcpy is fully inlined by the compiler. */
static inline unsigned char *copy_7_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 7);
  return out + 7;
}
/* Copy 8 bytes from FROM to OUT; return OUT advanced by 8.
 *
 * A fixed-size memcpy replaces the old `*(uint64_t *)out = *(uint64_t *)from`
 * fast path: that cast violates strict aliasing and may fault on unaligned
 * pointers, while memcpy with a constant size compiles to the same single
 * 8-byte load/store.  The BLOSC_STRICT_ALIGN branch did exactly this. */
static inline unsigned char *copy_8_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 8);
  return out + 8;
}
/* Copy 16 bytes from FROM to OUT; return OUT advanced by 16.
 * Neither pointer needs to be aligned.
 *
 * Fixes in the fallback paths: the old non-strict-align branch dereferenced
 * `uint64_t *` on arbitrary byte pointers (strict-aliasing and alignment UB),
 * and the strict-align branch was a byte loop; a constant-size memcpy is
 * correct on every target and compiles to two 8-byte moves.  The SSE2 path
 * now also keeps const-correct casts for the source pointer. */
static inline unsigned char *copy_16_bytes(unsigned char *out, const unsigned char *from) {
#if defined(__SSE2__)
  __m128i chunk = _mm_loadu_si128((const __m128i *)from);  /* unaligned load */
  _mm_storeu_si128((__m128i *)out, chunk);                 /* unaligned store */
#else
  memcpy(out, from, 16);
#endif
  return out + 16;
}
/* Copy 32 bytes from FROM to OUT; return OUT advanced by 32.
 * Neither pointer needs to be aligned.
 *
 * Fixes in the fallback paths: the old non-strict-align branch dereferenced
 * `uint64_t *` on arbitrary byte pointers (strict-aliasing and alignment UB),
 * and the strict-align branch was a byte loop; a constant-size memcpy is
 * correct on every target and compiles to a handful of wide moves.  The
 * intrinsic paths keep const-correct casts for the source pointer. */
static inline unsigned char *copy_32_bytes(unsigned char *out, const unsigned char *from) {
#if defined(__AVX2__)
  __m256i chunk = _mm256_loadu_si256((const __m256i *)from);  /* unaligned load */
  _mm256_storeu_si256((__m256i *)out, chunk);                 /* unaligned store */
#elif defined(__SSE2__)
  __m128i lo = _mm_loadu_si128((const __m128i *)from);
  __m128i hi = _mm_loadu_si128((const __m128i *)(from + 16));
  _mm_storeu_si128((__m128i *)out, lo);
  _mm_storeu_si128((__m128i *)(out + 16), hi);
#else
  memcpy(out, from, 32);
#endif
  return out + 32;
}
#if defined(__AVX2__)
/* Copy 32 bytes from FROM to OUT; return OUT + 32.
 * PRECONDITION: FROM must be 32-byte aligned (uses an *aligned* AVX2 load;
 * a misaligned FROM would fault).  OUT may be unaligned (unaligned store). */
static inline unsigned char *copy_32_bytes_aligned(unsigned char *out, const unsigned char *from) {
  __m256i chunk;
  chunk = _mm256_load_si256((__m256i*)from);   /* aligned load */
  _mm256_storeu_si256((__m256i*)out, chunk);   /* unaligned store */
  return out + 32;
}
#endif // __AVX2__
/* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
/* Dispatches to a fixed-size copy helper; LEN >= 8 violates the contract
   (asserted below).  On strict-alignment targets a byte loop is used. */
static inline unsigned char *copy_bytes(unsigned char *out, const unsigned char *from, unsigned len) {
  assert(len < 8);
#ifdef BLOSC_STRICT_ALIGN
  while (len--) {
    *out++ = *from++;
  }
#else
  /* Every case returns directly — there is no fallthrough here. */
  switch (len) {
    case 7:
      return copy_7_bytes(out, from);
    case 6:
      return copy_6_bytes(out, from);
    case 5:
      return copy_5_bytes(out, from);
    case 4:
      return copy_4_bytes(out, from);
    case 3:
      return copy_3_bytes(out, from);
    case 2:
      return copy_2_bytes(out, from);
    case 1:
      return copy_1_bytes(out, from);
    case 0:
      return out;
    default:
      assert(0);  /* unreachable: contract requires len < 8 */
  }
#endif /* BLOSC_STRICT_ALIGN */
  return out;
}
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
/* Requires LEN >= 8.  NOTE(review): assumes OUT and FROM do not overlap —
   overlapping copies must go through safecopy(); verify at call sites. */
static inline unsigned char *chunk_memcpy(unsigned char *out, const unsigned char *from, unsigned len) {
  unsigned sz = sizeof(uint64_t);
  unsigned rem = len % sz;
  unsigned by8;
  assert(len >= sz);
  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
  /* The head copy writes SZ bytes but the pointers then advance only by REM
     (< SZ), so the first word copied below overlaps the head copy; this is
     harmless for non-overlapping buffers (and redundant when REM == 0). */
  copy_8_bytes(out, from);
  len /= sz;  /* LEN now counts 8-byte words, not bytes */
  out += rem;
  from += rem;
  /* Duff's-device-style unrolling: copy LEN % 8 words via the switch, then
     8 words per loop iteration.  Every case intentionally falls through. */
  by8 = len % 8;
  len -= by8;
  switch (by8) {
    case 7:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 6:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 5:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 4:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 3:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 2:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 1:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fallthrough */
    default:
      break;
  }
  /* Main unrolled loop: 8 words (64 bytes) per iteration. */
  while (len) {
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    len -= 8;
  }
  return out;
}
/* 16-byte version of chunk_memcpy(): copy LEN (>= 16) bytes from FROM to OUT
   in 16-byte chunks; return OUT + LEN.  The head copy writes a full chunk and
   the pointers then advance only by LEN % 16, so the first loop iteration may
   overlap it — fine for non-overlapping buffers. */
static inline unsigned char *chunk_memcpy_16(unsigned char *out, const unsigned char *from, unsigned len) {
  const unsigned sz = 16;
  unsigned rem = len % sz;
  unsigned nchunks;
  assert(len >= sz);
  /* Head copy so the loop below handles an exact multiple of SZ bytes. */
  copy_16_bytes(out, from);
  nchunks = len / sz;
  out += rem;
  from += rem;
  while (nchunks-- > 0) {
    copy_16_bytes(out, from);
    out += sz;
    from += sz;
  }
  return out;
}
/* 32-byte version of chunk_memcpy(): copy LEN (>= 32) bytes from FROM to OUT
   in 32-byte chunks; return OUT + LEN.  The head copy writes a full chunk and
   the pointers then advance only by LEN % 32, so the first loop iteration may
   overlap it — fine for non-overlapping buffers. */
static inline unsigned char *chunk_memcpy_32(unsigned char *out, const unsigned char *from, unsigned len) {
  const unsigned sz = 32;
  unsigned rem = len % sz;
  unsigned nchunks;
  assert(len >= sz);
  /* Head copy so the loop below handles an exact multiple of SZ bytes. */
  copy_32_bytes(out, from);
  nchunks = len / sz;
  out += rem;
  from += rem;
  while (nchunks-- > 0) {
    copy_32_bytes(out, from);
    out += sz;
    from += sz;
  }
  return out;
}
/* 32-byte *unrolled* version of chunk_memcpy() */
/* Requires LEN >= 32.  NOTE(review): assumes OUT and FROM do not overlap —
   verify at call sites. */
static inline unsigned char *chunk_memcpy_32_unrolled(unsigned char *out, const unsigned char *from, unsigned len) {
  unsigned sz = 32;
  unsigned rem = len % sz;
  unsigned by8;
  assert(len >= sz);
  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
  /* The head copy writes SZ bytes but the pointers then advance only by REM
     (< SZ), so the first chunk copied below overlaps the head copy; this is
     harmless for non-overlapping buffers (and redundant when REM == 0). */
  copy_32_bytes(out, from);
  len /= sz;  /* LEN now counts 32-byte chunks, not bytes */
  out += rem;
  from += rem;
  /* Duff's-device-style unrolling: copy LEN % 8 chunks via the switch, then
     8 chunks per loop iteration.  Every case intentionally falls through. */
  by8 = len % 8;
  len -= by8;
  switch (by8) {
    case 7:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 6:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 5:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 4:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 3:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 2:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fallthrough */
    case 1:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fallthrough */
    default:
      break;
  }
  /* Main unrolled loop: 8 chunks (256 bytes) per iteration. */
  while (len) {
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    len -= 8;
  }
  return out;
}
/* SSE2/AVX2 *unaligned* version of chunk_memcpy() */
#if defined(__SSE2__) || defined(__AVX2__)
/* Copy LEN (>= the vector width SZ) bytes from FROM to OUT using unaligned
   vector copies; return OUT + LEN.  NOTE(review): assumes OUT and FROM do not
   overlap — verify at call sites. */
static inline unsigned char *chunk_memcpy_unaligned(unsigned char *out, const unsigned char *from, unsigned len) {
#if defined(__AVX2__)
  unsigned sz = sizeof(__m256i);  /* 32 */
#elif defined(__SSE2__)
  unsigned sz = sizeof(__m128i);  /* 16 */
#endif
  unsigned rem = len % sz;
  unsigned ilen;
  assert(len >= sz);
  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
  /* Head copy writes SZ bytes but the pointers then advance only by REM (< SZ),
     so the first loop iteration overlaps the head copy; harmless when the
     buffers do not overlap (and redundant when REM == 0). */
#if defined(__AVX2__)
  copy_32_bytes(out, from);
#elif defined(__SSE2__)
  copy_16_bytes(out, from);
#endif
  len /= sz;  /* LEN now counts SZ-byte chunks */
  out += rem;
  from += rem;
  for (ilen = 0; ilen < len; ilen++) {
#if defined(__AVX2__)
    copy_32_bytes(out, from);
#elif defined(__SSE2__)
    copy_16_bytes(out, from);
#endif
    out += sz;
    from += sz;
  }
  return out;
}
#endif // __SSE2__ || __AVX2__
#if defined(__SSE2__) || defined(__AVX2__)
/* SSE2/AVX2 *aligned* version of chunk_memcpy() */
/* Copy LEN (>= the vector width SZ) bytes from FROM to OUT; return OUT + LEN.
   Uses *aligned* vector loads in the main loop (stores remain unaligned).
   NOTE(review): assumes OUT and FROM do not overlap — verify at call sites. */
static inline unsigned char *chunk_memcpy_aligned(unsigned char *out, const unsigned char *from, unsigned len) {
#if defined(__AVX2__)
  unsigned sz = sizeof(__m256i);  /* 32 */
  __m256i chunk;
#elif defined(__SSE2__)
  unsigned sz = sizeof(__m128i);  /* 16 */
  __m128i chunk;
#endif
  /* Bytes until FROM + bytes_to_align is SZ-aligned.  Note this lies in
     (0, SZ]: when FROM is already aligned it is SZ, not 0, so the head copy
     below always advances and is never skipped. */
  unsigned bytes_to_align = sz - (unsigned)(((uintptr_t)(const void *)(from)) % sz);
  unsigned corrected_len = len - bytes_to_align;  /* bytes left after the head copy */
  unsigned rem = corrected_len % sz;              /* scalar tail, < SZ */
  unsigned ilen;
  assert(len >= sz);
  /* Copy a few bytes to make sure the loop below has aligned access. */
  /* Head copy: one full unaligned vector; pointers then advance only by
     bytes_to_align (<= SZ), so the first aligned load may overlap it. */
#if defined(__AVX2__)
  chunk = _mm256_loadu_si256((__m256i *) from);
  _mm256_storeu_si256((__m256i *) out, chunk);
#elif defined(__SSE2__)
  chunk = _mm_loadu_si128((__m128i *) from);
  _mm_storeu_si128((__m128i *) out, chunk);
#endif
  out += bytes_to_align;
  from += bytes_to_align;
  len = corrected_len / sz;  /* LEN now counts SZ-byte chunks */
  for (ilen = 0; ilen < len; ilen++) {
#if defined(__AVX2__)
    chunk = _mm256_load_si256((__m256i *) from);  /* *aligned* load */
    _mm256_storeu_si256((__m256i *) out, chunk);
#elif defined(__SSE2__)
    chunk = _mm_load_si128((__m128i *) from);  /* *aligned* load */
    _mm_storeu_si128((__m128i *) out, chunk);
#endif
    out += sz;
    from += sz;
  }
  /* Copy remaining bytes (REM < SZ); chunk_memcpy requires >= 8 bytes. */
  if (rem < 8) {
    out = copy_bytes(out, from, rem);
  }
  else {
    out = chunk_memcpy(out, from, rem);
  }
  return out;
}
#endif // __AVX2__ || __SSE2__
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN.
   NOTE: OUT and FROM must not overlap; use safecopy() for overlapping buffers. */
unsigned char *fastcopy(unsigned char *out, const unsigned char *from, unsigned len) {
  /* Exact power-of-two sizes get a single fixed-size copy. */
  if (len == 32) {
    return copy_32_bytes(out, from);
  }
  if (len == 16) {
    return copy_16_bytes(out, from);
  }
  if (len == 8) {
    return copy_8_bytes(out, from);
  }
  /* Short copies go through the small-size dispatcher. */
  if (len < 8) {
    return copy_bytes(out, from, len);
  }
#if defined(__SSE2__)
  if (len < 16) {
    return chunk_memcpy(out, from, len);
  }
#if defined(__AVX2__)
  if (len < 32) {
    return chunk_memcpy_16(out, from, len);
  }
#endif  /* __AVX2__ */
  return chunk_memcpy_unaligned(out, from, len);
#else
  /* No vector units available: word-at-a-time copy. */
  return chunk_memcpy(out, from, len);
#endif  /* __SSE2__ */
}
/* Same as fastcopy() but without overwriting origin or destination when they overlap */
unsigned char* safecopy(unsigned char *out, const unsigned char *from, unsigned len) {
#if defined(__AVX2__)
  unsigned sz = sizeof(__m256i);
#elif defined(__SSE2__)
  unsigned sz = sizeof(__m128i);
#else
  unsigned sz = sizeof(uint64_t);
#endif
  /* When the destination starts at least SZ bytes past the source, the widest
     copy primitive cannot read bytes it has already overwritten, so the fast
     path is safe. */
  if (out - sz >= from) {
    return fastcopy(out, from, len);
  }
  /* Close overlap: fall back to a strictly sequential byte copy. */
  while (len-- > 0) {
    *out++ = *from++;
  }
  return out;
}