123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442 |
- /*********************************************************************
- Blosc - Blocked Shuffling and Compression Library
- Author: Francesc Alted <francesc@blosc.org>
- Creation date: 2009-05-20
- See LICENSES/BLOSC.txt for details about copyright and rights to use.
- **********************************************************************/
- #include "shuffle.h"
- #include "shuffle-generic.h"
- #include "bitshuffle-generic.h"
- #include <stdio.h>
/* Provide a `bool` type without including <stdbool.h>, which
   Visual Studio < 2013 does not ship. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L
/* C23 (and the c2x previews): `bool` is a keyword and must not be
   redefined. */
#elif defined(bool) || defined(__bool_true_false_are_defined)
/* <stdbool.h> has already been included by another header; `bool` is a
   macro there, so a typedef of the same name would not compile. */
#elif defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
/* have a C99 compiler */
typedef _Bool bool;
#else
/* do not have a C99 compiler */
typedef unsigned char bool;
#endif
/* The CPU-feature builtins are only assumed for real GCC >= 4.8.
   The version test is parenthesised as a unit so that the
   `!defined(__clang__)` and `defined(__GNUC__)` guards apply to both
   alternatives; `&&` binds tighter than `||`, so without the parentheses
   the 4.8 alternative was evaluated on its own. */
#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
    (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
#define HAVE_CPU_FEAT_INTRIN
#endif
- /* Include hardware-accelerated shuffle/unshuffle routines based on
- the target architecture. Note that a target architecture may support
- more than one type of acceleration!*/
- #if defined(SHUFFLE_AVX2_ENABLED)
- #include "shuffle-avx2.h"
- #include "bitshuffle-avx2.h"
- #endif /* defined(SHUFFLE_AVX2_ENABLED) */
- #if defined(SHUFFLE_SSE2_ENABLED)
- #include "shuffle-sse2.h"
- #include "bitshuffle-sse2.h"
- #endif /* defined(SHUFFLE_SSE2_ENABLED) */
/* Signatures of the byte-level shuffle/unshuffle kernels. */
typedef void (*shuffle_func)(const size_t bytesoftype, const size_t blocksize,
                             const uint8_t* src, const uint8_t* dest);
typedef void (*unshuffle_func)(const size_t bytesoftype, const size_t blocksize,
                               const uint8_t* src, const uint8_t* dest);

/* Signatures of the bit-level shuffle/unshuffle kernels. The dispatchers in
   this file pass: source, destination, element count, element size and a
   scratch buffer, in that order. */
typedef int64_t (*bitshuffle_func)(void* in, void* out, const size_t size,
                                   const size_t elem_size, void* tmp_buf);
typedef int64_t (*bitunshuffle_func)(void* in, void* out, const size_t size,
                                     const size_t elem_size, void* tmp_buf);

/* One complete set of shuffle routines (generic or hardware-accelerated). */
typedef struct shuffle_implementation {
  const char* name;               /* Label identifying this implementation. */
  shuffle_func shuffle;           /* Byte shuffle routine. */
  unshuffle_func unshuffle;       /* Byte unshuffle routine. */
  bitshuffle_func bitshuffle;     /* Bit shuffle routine. */
  bitunshuffle_func bitunshuffle; /* Bit unshuffle routine. */
} shuffle_implementation_t;
/* Bit flags describing which accelerated instruction sets may be used.
   The values are combined with bitwise OR and tested with bitwise AND. */
typedef enum {
  BLOSC_HAVE_NOTHING = 0,
  BLOSC_HAVE_SSE2 = 1 << 0,
  BLOSC_HAVE_AVX2 = 1 << 1
} blosc_cpu_features;
- /* Detect hardware and set function pointers to the best shuffle/unshuffle
- implementations supported by the host processor. */
- #if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED) /* Intel/i686 */
- /* The __builtin_cpu_supports() call is disabled because it has issues with
- newer versions of gcc (such as 5.3.1 in the forthcoming ubuntu/xenial):
- "undefined symbol: __cpu_model"
- For a similar report, see:
- https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
- */
- #if defined(HAVE_CPU_FEAT_INTRIN) && 0
- static blosc_cpu_features blosc_get_cpu_features(void) {
- blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
- if (__builtin_cpu_supports("sse2")) {
- cpu_features |= BLOSC_HAVE_SSE2;
- }
- if (__builtin_cpu_supports("avx2")) {
- cpu_features |= BLOSC_HAVE_AVX2;
- }
- return cpu_features;
- }
- #else
- #if defined(_MSC_VER) && !defined(__clang__)
- #include <intrin.h> /* Needed for __cpuid */
- /* _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
- #if _MSC_FULL_VER >= 160040219
- #include <immintrin.h> /* Needed for _xgetbv */
- #elif defined(_M_IX86)
- /* Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets.
- The requested register index `xcr` is loaded into ecx, the instruction is
- emitted as the raw bytes 0x0f 0x01 0xd0 instead of by mnemonic, and the
- 64-bit return value is assembled from edx (upper 32 bits) and eax
- (lower 32 bits). */
- static uint64_t _xgetbv(uint32_t xcr) {
- uint32_t xcr0, xcr1;
- __asm {
- mov ecx, xcr
- _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
- mov xcr0, eax
- mov xcr1, edx
- }
- return ((uint64_t)xcr1 << 32) | xcr0;
- }
- #elif defined(_M_X64)
/* Stand-in for _xgetbv on VS2008 and VS2010 RTM with 64-bit (x64) targets.
   Those compilers cannot build any of the newer acceleration ISAs (e.g.
   AVX2) that blosc supports, and every x64 processor has SSE2, so a fixed
   answer is sufficient: register 0 reports only bit 1 set, any other
   register reports zero. */
static __inline uint64_t _xgetbv(uint32_t xcr) {
  if (xcr != 0) {
    return 0UL;
  }
  /* A 64-bit OS must have XMM save support. */
  return 1UL << 1;
}
- #else
- /* Hardware detection for any other MSVC targets (e.g., ARM)
- isn't implemented at this time. */
- #error This version of c-blosc only supports x86 and x64 targets with MSVC.
- #endif /* _MSC_FULL_VER >= 160040219 */
- #else
/* Inline-assembly replacement for the __cpuidex intrinsic, for GCC, Clang
   and other compilers that accept this asm syntax.

   Executes `cpuid` with eax = function_id and ecx = subfunction_id and
   stores the resulting eax, ebx, ecx, edx into cpuInfo[0..3]. */
__attribute__((always_inline))
static inline void
__cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
#if defined(__i386__) && defined(__PIC__)
  /* Can't clobber ebx with PIC running under 32-bit, so it is parked in edi
     around the instruction and the ebx result is read back from edi.
     https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
  */
  __asm__ __volatile__ (
    "movl %%ebx, %%edi\n\t"
    "cpuid\n\t"
    "xchgl %%ebx, %%edi"
    : "=D" (cpuInfo[1]),
      "=a" (cpuInfo[0]),
      "=c" (cpuInfo[2]),
      "=d" (cpuInfo[3])
    : "a" (function_id), "c" (subfunction_id)
  );
#else
  __asm__ __volatile__ (
    "cpuid"
    : "=b" (cpuInfo[1]),
      "=a" (cpuInfo[0]),
      "=c" (cpuInfo[2]),
      "=d" (cpuInfo[3])
    : "a" (function_id), "c" (subfunction_id)
  );
#endif /* defined(__i386__) && defined(__PIC__) */
}
- #define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0)
- #define _XCR_XFEATURE_ENABLED_MASK 0
/* Read the extended control register selected by `xcr` and return its
   64-bit contents (edx holds the upper half, eax the lower half).
   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
*/
static inline uint64_t
_xgetbv(uint32_t xcr) {
  uint32_t low_word;
  uint32_t high_word;

  /* "xgetbv", written as raw instruction bytes because some older
     compilers have issues with the mnemonic form. */
  __asm__ __volatile__ (
    ".byte 0x0f, 0x01, 0xd0"
    : "=a" (low_word), "=d" (high_word)
    : "c" (xcr)
  );

  return (uint64_t)low_word | ((uint64_t)high_word << 32);
}
- #endif /* defined(_MSC_FULL_VER) */
- #ifndef _XCR_XFEATURE_ENABLED_MASK
- #define _XCR_XFEATURE_ENABLED_MASK 0x0
- #endif
- static blosc_cpu_features blosc_get_cpu_features(void) {
- blosc_cpu_features result = BLOSC_HAVE_NOTHING;
- int32_t max_basic_function_id;
- /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
- int32_t cpu_info[4];
- int sse2_available;
- int sse3_available;
- int ssse3_available;
- int sse41_available;
- int sse42_available;
- int xsave_available;
- int xsave_enabled_by_os;
- int avx2_available = 0;
- int avx512bw_available = 0;
- int xmm_state_enabled = 0;
- int ymm_state_enabled = 0;
- int zmm_state_enabled = 0;
- uint64_t xcr0_contents;
- char* envvar;
- /* Get the number of basic functions available. */
- __cpuid(cpu_info, 0);
- max_basic_function_id = cpu_info[0];
- /* Check for SSE-based features and required OS support */
- __cpuid(cpu_info, 1);
- sse2_available = (cpu_info[3] & (1 << 26)) != 0;
- sse3_available = (cpu_info[2] & (1 << 0)) != 0;
- ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
- sse41_available = (cpu_info[2] & (1 << 19)) != 0;
- sse42_available = (cpu_info[2] & (1 << 20)) != 0;
- xsave_available = (cpu_info[2] & (1 << 26)) != 0;
- xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
- /* Check for AVX-based features, if the processor supports extended features. */
- if (max_basic_function_id >= 7) {
- __cpuid(cpu_info, 7);
- avx2_available = (cpu_info[1] & (1 << 5)) != 0;
- avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
- }
- /* Even if certain features are supported by the CPU, they may not be supported
- by the OS (in which case using them would crash the process or system).
- If xsave is available and enabled by the OS, check the contents of the
- extended control register XCR0 to see if the CPU features are enabled. */
- #if defined(_XCR_XFEATURE_ENABLED_MASK)
- if (xsave_available && xsave_enabled_by_os && (
- sse2_available || sse3_available || ssse3_available
- || sse41_available || sse42_available
- || avx2_available || avx512bw_available)) {
- /* Determine which register states can be restored by the OS. */
- xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
- xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
- ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
- /* Require support for both the upper 256-bits of zmm0-zmm15 to be
- restored as well as all of zmm16-zmm31 and the opmask registers. */
- zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
- }
- #endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
- envvar = getenv("BLOSC_PRINT_SHUFFLE_ACCEL");
- if (envvar != NULL) {
- printf("Shuffle CPU Information:\n");
- printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
- printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
- printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
- printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
- printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
- printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
- printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
- printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
- printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
- printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
- printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
- printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
- }
- /* Using the gathered CPU information, determine which implementation to use. */
- /* technically could fail on sse2 cpu on os without xmm support, but that
- * shouldn't exist anymore */
- if (sse2_available) {
- result |= BLOSC_HAVE_SSE2;
- }
- if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
- result |= BLOSC_HAVE_AVX2;
- }
- return result;
- }
- #endif
- #else /* No hardware acceleration supported for the target architecture. */
- #if defined(_MSC_VER)
- #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
- #else
- #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
- #endif
- static blosc_cpu_features blosc_get_cpu_features(void) {
- return BLOSC_HAVE_NOTHING;
- }
- #endif
- static shuffle_implementation_t get_shuffle_implementation(void) {
- blosc_cpu_features cpu_features = blosc_get_cpu_features();
- shuffle_implementation_t impl_generic;
- #if defined(SHUFFLE_AVX2_ENABLED)
- if (cpu_features & BLOSC_HAVE_AVX2) {
- shuffle_implementation_t impl_avx2;
- impl_avx2.name = "avx2";
- impl_avx2.shuffle = (shuffle_func)shuffle_avx2;
- impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2;
- impl_avx2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_avx2;
- impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_avx2;
- return impl_avx2;
- }
- #endif /* defined(SHUFFLE_AVX2_ENABLED) */
- #if defined(SHUFFLE_SSE2_ENABLED)
- if (cpu_features & BLOSC_HAVE_SSE2) {
- shuffle_implementation_t impl_sse2;
- impl_sse2.name = "sse2";
- impl_sse2.shuffle = (shuffle_func)shuffle_sse2;
- impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2;
- impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_sse2;
- impl_sse2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_sse2;
- return impl_sse2;
- }
- #endif /* defined(SHUFFLE_SSE2_ENABLED) */
- /* Processor doesn't support any of the hardware-accelerated implementations,
- so use the generic implementation. */
- impl_generic.name = "generic";
- impl_generic.shuffle = (shuffle_func)shuffle_generic;
- impl_generic.unshuffle = (unshuffle_func)unshuffle_generic;
- impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
- impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
- return impl_generic;
- }
- /* Flag indicating whether the implementation has been initialized.
- Zero means it hasn't been initialized, non-zero means it has. */
- static int32_t implementation_initialized;
- /* The dynamically-chosen shuffle/unshuffle implementation.
- This is only safe to use once `implementation_initialized` is set. */
- static shuffle_implementation_t host_implementation;
- /* Initialize the shuffle implementation, if necessary. */
- #if defined(__GNUC__) || defined(__clang__)
- __attribute__((always_inline))
- #endif
- static
- #if defined(_MSC_VER)
- __forceinline
- #else
- inline
- #endif
- void init_shuffle_implementation(void) {
- /* Initialization could (in rare cases) take place concurrently on
- multiple threads, but it shouldn't matter because the
- initialization should return the same result on each thread (so
- the implementation will be the same). Since that's the case we
- can avoid complicated synchronization here and get a small
- performance benefit because we don't need to perform a volatile
- load on the initialization variable each time this function is
- called. */
- #if defined(__GNUC__) || defined(__clang__)
- if (__builtin_expect(!implementation_initialized, 0)) {
- #else
- if (!implementation_initialized) {
- #endif
- /* Initialize the implementation. */
- host_implementation = get_shuffle_implementation();
- /* Set the flag indicating the implementation has been initialized. */
- implementation_initialized = 1;
- }
- }
- /* Shuffle a block by dynamically dispatching to the appropriate
- hardware-accelerated routine at run-time. */
- void
- shuffle(const size_t bytesoftype, const size_t blocksize,
- const uint8_t* _src, const uint8_t* _dest) {
- /* Initialize the shuffle implementation if necessary. */
- init_shuffle_implementation();
- /* The implementation is initialized.
- Dispatch to it's shuffle routine. */
- (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
- }
- /* Unshuffle a block by dynamically dispatching to the appropriate
- hardware-accelerated routine at run-time. */
- void
- unshuffle(const size_t bytesoftype, const size_t blocksize,
- const uint8_t* _src, const uint8_t* _dest) {
- /* Initialize the shuffle implementation if necessary. */
- init_shuffle_implementation();
- /* The implementation is initialized.
- Dispatch to it's unshuffle routine. */
- (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
- }
- /* Bit-shuffle a block by dynamically dispatching to the appropriate
- hardware-accelerated routine at run-time. */
- int
- bitshuffle(const size_t bytesoftype, const size_t blocksize,
- const uint8_t* const _src, const uint8_t* _dest,
- const uint8_t* _tmp) {
- int size = blocksize / bytesoftype;
- /* Initialize the shuffle implementation if necessary. */
- init_shuffle_implementation();
- if ((size % 8) == 0)
- /* The number of elems is a multiple of 8 which is supported by
- bitshuffle. */
- return (int)(host_implementation.bitshuffle)((void*)_src, (void*)_dest,
- blocksize / bytesoftype,
- bytesoftype, (void*)_tmp);
- else
- memcpy((void*)_dest, (void*)_src, blocksize);
- return size;
- }
- /* Bit-unshuffle a block by dynamically dispatching to the appropriate
- hardware-accelerated routine at run-time. */
- int
- bitunshuffle(const size_t bytesoftype, const size_t blocksize,
- const uint8_t* const _src, const uint8_t* _dest,
- const uint8_t* _tmp) {
- int size = blocksize / bytesoftype;
- /* Initialize the shuffle implementation if necessary. */
- init_shuffle_implementation();
- if ((size % 8) == 0)
- /* The number of elems is a multiple of 8 which is supported by
- bitshuffle. */
- return (int)(host_implementation.bitunshuffle)((void*)_src, (void*)_dest,
- blocksize / bytesoftype,
- bytesoftype, (void*)_tmp);
- else
- memcpy((void*)_dest, (void*)_src, blocksize);
- return size;
- }
|