shuffle.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. /*********************************************************************
  2. Blosc - Blocked Shuffling and Compression Library
  3. Author: Francesc Alted <francesc@blosc.org>
  4. Creation date: 2009-05-20
  5. See LICENSES/BLOSC.txt for details about copyright and rights to use.
  6. **********************************************************************/
  7. #include "shuffle.h"
  8. #include "shuffle-common.h"
  9. #include "shuffle-generic.h"
  10. #include "bitshuffle-generic.h"
  11. #include <stdio.h>
  12. #include <string.h>
  13. /* Visual Studio < 2013 does not have stdbool.h so here it is a replacement: */
  14. #if defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
  15. /* have a C99 compiler */
  16. typedef _Bool bool;
  17. #else
  18. /* do not have a C99 compiler */
  19. typedef unsigned char bool;
  20. #endif
  21. static const bool false = 0;
  22. static const bool true = 1;
  23. #if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
  24. __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
  25. #define HAVE_CPU_FEAT_INTRIN
  26. #endif
  27. /* Include hardware-accelerated shuffle/unshuffle routines based on
  28. the target architecture. Note that a target architecture may support
  29. more than one type of acceleration!*/
  30. #if defined(SHUFFLE_AVX2_ENABLED)
  31. #include "shuffle-avx2.h"
  32. #include "bitshuffle-avx2.h"
  33. #endif /* defined(SHUFFLE_AVX2_ENABLED) */
  34. #if defined(SHUFFLE_SSE2_ENABLED)
  35. #include "shuffle-sse2.h"
  36. #include "bitshuffle-sse2.h"
  37. #endif /* defined(SHUFFLE_SSE2_ENABLED) */
  38. /* Define function pointer types for shuffle/unshuffle routines. */
  39. typedef void(*shuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*);
  40. typedef void(*unshuffle_func)(const size_t, const size_t, const uint8_t*, const uint8_t*);
  41. typedef int64_t(*bitshuffle_func)(void*, void*, const size_t, const size_t, void*);
  42. typedef int64_t(*bitunshuffle_func)(void*, void*, const size_t, const size_t, void*);
  43. /* An implementation of shuffle/unshuffle routines. */
  44. typedef struct shuffle_implementation {
  45. /* Name of this implementation. */
  46. const char* name;
  47. /* Function pointer to the shuffle routine for this implementation. */
  48. shuffle_func shuffle;
  49. /* Function pointer to the unshuffle routine for this implementation. */
  50. unshuffle_func unshuffle;
  51. /* Function pointer to the bitshuffle routine for this implementation. */
  52. bitshuffle_func bitshuffle;
  53. /* Function pointer to the bitunshuffle routine for this implementation. */
  54. bitunshuffle_func bitunshuffle;
  55. } shuffle_implementation_t;
  56. typedef enum {
  57. BLOSC_HAVE_NOTHING = 0,
  58. BLOSC_HAVE_SSE2 = 1,
  59. BLOSC_HAVE_AVX2 = 2
  60. } blosc_cpu_features;
  61. /* Detect hardware and set function pointers to the best shuffle/unshuffle
  62. implementations supported by the host processor. */
  63. #if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED) /* Intel/i686 */
/* The __builtin_cpu_supports() call is disabled because it has issues with
   newer versions of gcc (like 5.3.1 in the forthcoming ubuntu/xenial):
   "undefined symbol: __cpu_model"
   For a similar report, see:
   https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
*/
  70. #if defined(HAVE_CPU_FEAT_INTRIN) && 0
  71. static blosc_cpu_features blosc_get_cpu_features(void) {
  72. blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
  73. if (__builtin_cpu_supports("sse2")) {
  74. cpu_features |= BLOSC_HAVE_SSE2;
  75. }
  76. if (__builtin_cpu_supports("avx2")) {
  77. cpu_features |= BLOSC_HAVE_AVX2;
  78. }
  79. return cpu_features;
  80. }
  81. #else
  82. #if defined(_MSC_VER) && !defined(__clang__)
  83. #include <intrin.h> /* Needed for __cpuid */
  84. /* _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
  85. #if _MSC_FULL_VER >= 160040219
  86. #include <immintrin.h> /* Needed for _xgetbv */
  87. #elif defined(_M_IX86)
  88. /* Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */
  89. static uint64_t _xgetbv(uint32_t xcr) {
  90. uint32_t xcr0, xcr1;
  91. __asm {
  92. mov ecx, xcr
  93. _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
  94. mov xcr0, eax
  95. mov xcr1, edx
  96. }
  97. return ((uint64_t)xcr1 << 32) | xcr0;
  98. }
  99. #elif defined(_M_X64)
  100. /* Implement _xgetbv for VS2008 and VS2010 RTM with 64-bit (x64) targets.
  101. These compilers don't support any of the newer acceleration ISAs
  102. (e.g., AVX2) supported by blosc, and all x64 hardware supports SSE2
  103. which means we can get away with returning a hard-coded value from
  104. this implementation of _xgetbv. */
  105. static inline uint64_t
  106. _xgetbv(uint32_t xcr) {
  107. /* A 64-bit OS must have XMM save support. */
  108. return xcr == 0 ? (1UL << 1) : 0UL;
  109. }
  110. #else
  111. /* Hardware detection for any other MSVC targets (e.g., ARM)
  112. isn't implemented at this time. */
  113. #error This version of c-blosc only supports x86 and x64 targets with MSVC.
  114. #endif /* _MSC_FULL_VER >= 160040219 */
  115. #else
  116. /* Implement the __cpuid and __cpuidex intrinsics for GCC, Clang,
  117. and others using inline assembly. */
  118. __attribute__((always_inline))
  119. static inline void
  120. __cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
  121. __asm__ __volatile__ (
  122. # if defined(__i386__) && defined (__PIC__)
  123. /* Can't clobber ebx with PIC running under 32-bit, so it needs to be manually restored.
  124. https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
  125. */
  126. "movl %%ebx, %%edi\n\t"
  127. "cpuid\n\t"
  128. "xchgl %%ebx, %%edi":
  129. "=D" (cpuInfo[1]),
  130. #else
  131. "cpuid":
  132. "=b" (cpuInfo[1]),
  133. #endif /* defined(__i386) && defined(__PIC__) */
  134. "=a" (cpuInfo[0]),
  135. "=c" (cpuInfo[2]),
  136. "=d" (cpuInfo[3]) :
  137. "a" (function_id), "c" (subfunction_id)
  138. );
  139. }
  140. #define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0)
  141. #define _XCR_XFEATURE_ENABLED_MASK 0
  142. /* Reads the content of an extended control register.
  143. https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
  144. */
  145. static inline uint64_t
  146. _xgetbv(uint32_t xcr) {
  147. uint32_t eax, edx;
  148. __asm__ __volatile__ (
  149. /* "xgetbv"
  150. This is specified as raw instruction bytes due to some older compilers
  151. having issues with the mnemonic form.
  152. */
  153. ".byte 0x0f, 0x01, 0xd0":
  154. "=a" (eax),
  155. "=d" (edx) :
  156. "c" (xcr)
  157. );
  158. return ((uint64_t)edx << 32) | eax;
  159. }
  160. #endif /* defined(_MSC_FULL_VER) */
  161. #ifndef _XCR_XFEATURE_ENABLED_MASK
  162. #define _XCR_XFEATURE_ENABLED_MASK 0x0
  163. #endif
  164. static blosc_cpu_features blosc_get_cpu_features(void) {
  165. blosc_cpu_features result = BLOSC_HAVE_NOTHING;
  166. int32_t max_basic_function_id;
  167. /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
  168. int32_t cpu_info[4];
  169. int sse2_available;
  170. int sse3_available;
  171. int ssse3_available;
  172. int sse41_available;
  173. int sse42_available;
  174. int xsave_available;
  175. int xsave_enabled_by_os;
  176. int avx2_available = 0;
  177. int avx512bw_available = 0;
  178. int xmm_state_enabled = 0;
  179. int ymm_state_enabled = 0;
  180. int zmm_state_enabled = 0;
  181. uint64_t xcr0_contents;
  182. /* Get the number of basic functions available. */
  183. __cpuid(cpu_info, 0);
  184. max_basic_function_id = cpu_info[0];
  185. /* Check for SSE-based features and required OS support */
  186. __cpuid(cpu_info, 1);
  187. sse2_available = (cpu_info[3] & (1 << 26)) != 0;
  188. sse3_available = (cpu_info[2] & (1 << 0)) != 0;
  189. ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
  190. sse41_available = (cpu_info[2] & (1 << 19)) != 0;
  191. sse42_available = (cpu_info[2] & (1 << 20)) != 0;
  192. xsave_available = (cpu_info[2] & (1 << 26)) != 0;
  193. xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
  194. /* Check for AVX-based features, if the processor supports extended features. */
  195. if (max_basic_function_id >= 7) {
  196. __cpuid(cpu_info, 7);
  197. avx2_available = (cpu_info[1] & (1 << 5)) != 0;
  198. avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
  199. }
  200. /* Even if certain features are supported by the CPU, they may not be supported
  201. by the OS (in which case using them would crash the process or system).
  202. If xsave is available and enabled by the OS, check the contents of the
  203. extended control register XCR0 to see if the CPU features are enabled. */
  204. #if defined(_XCR_XFEATURE_ENABLED_MASK)
  205. if (xsave_available && xsave_enabled_by_os && (
  206. sse2_available || sse3_available || ssse3_available
  207. || sse41_available || sse42_available
  208. || avx2_available || avx512bw_available)) {
  209. /* Determine which register states can be restored by the OS. */
  210. xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
  211. xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
  212. ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
  213. /* Require support for both the upper 256-bits of zmm0-zmm15 to be
  214. restored as well as all of zmm16-zmm31 and the opmask registers. */
  215. zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
  216. }
  217. #endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
  218. #if defined(BLOSC_DUMP_CPU_INFO)
  219. printf("Shuffle CPU Information:\n");
  220. printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
  221. printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
  222. printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
  223. printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
  224. printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
  225. printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
  226. printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
  227. printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
  228. printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
  229. printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
  230. printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
  231. printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
  232. #endif /* defined(BLOSC_DUMP_CPU_INFO) */
  233. /* Using the gathered CPU information, determine which implementation to use. */
  234. /* technically could fail on sse2 cpu on os without xmm support, but that
  235. * shouldn't exist anymore */
  236. if (sse2_available) {
  237. result |= BLOSC_HAVE_SSE2;
  238. }
  239. if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
  240. result |= BLOSC_HAVE_AVX2;
  241. }
  242. return result;
  243. }
  244. #endif
  245. #else /* No hardware acceleration supported for the target architecture. */
  246. #if defined(_MSC_VER)
  247. #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
  248. #else
  249. #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
  250. #endif
  251. static blosc_cpu_features blosc_get_cpu_features(void) {
  252. return BLOSC_HAVE_NOTHING;
  253. }
  254. #endif
  255. static shuffle_implementation_t get_shuffle_implementation() {
  256. blosc_cpu_features cpu_features = blosc_get_cpu_features();
  257. shuffle_implementation_t impl_generic;
  258. #if defined(SHUFFLE_AVX2_ENABLED)
  259. if (cpu_features & BLOSC_HAVE_AVX2) {
  260. shuffle_implementation_t impl_avx2;
  261. impl_avx2.name = "avx2";
  262. impl_avx2.shuffle = (shuffle_func)shuffle_avx2;
  263. impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2;
  264. impl_avx2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_avx2;
  265. impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_avx2;
  266. return impl_avx2;
  267. }
  268. #endif /* defined(SHUFFLE_AVX2_ENABLED) */
  269. #if defined(SHUFFLE_SSE2_ENABLED)
  270. if (cpu_features & BLOSC_HAVE_SSE2) {
  271. shuffle_implementation_t impl_sse2;
  272. impl_sse2.name = "sse2";
  273. impl_sse2.shuffle = (shuffle_func)shuffle_sse2;
  274. impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2;
  275. impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_sse2;
  276. impl_sse2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_sse2;
  277. return impl_sse2;
  278. }
  279. #endif /* defined(SHUFFLE_SSE2_ENABLED) */
  280. /* Processor doesn't support any of the hardware-accelerated implementations,
  281. so use the generic implementation. */
  282. impl_generic.name = "generic";
  283. impl_generic.shuffle = (shuffle_func)shuffle_generic;
  284. impl_generic.unshuffle = (unshuffle_func)unshuffle_generic;
  285. impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
  286. impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
  287. return impl_generic;
  288. }
  289. /* Flag indicating whether the implementation has been initialized.
  290. Zero means it hasn't been initialized, non-zero means it has. */
  291. static int32_t implementation_initialized;
  292. /* The dynamically-chosen shuffle/unshuffle implementation.
  293. This is only safe to use once `implementation_initialized` is set. */
  294. static shuffle_implementation_t host_implementation;
  295. /* Initialize the shuffle implementation, if necessary. */
  296. #if defined(__GNUC__) || defined(__clang__)
  297. __attribute__((always_inline))
  298. #endif
  299. static
  300. #if defined(_MSC_VER)
  301. __forceinline
  302. #else
  303. inline
  304. #endif
  305. void init_shuffle_implementation() {
  306. /* Initialization could (in rare cases) take place concurrently on
  307. multiple threads, but it shouldn't matter because the
  308. initialization should return the same result on each thread (so
  309. the implementation will be the same). Since that's the case we
  310. can avoid complicated synchronization here and get a small
  311. performance benefit because we don't need to perform a volatile
  312. load on the initialization variable each time this function is
  313. called. */
  314. #if defined(__GNUC__) || defined(__clang__)
  315. if (__builtin_expect(!implementation_initialized, 0)) {
  316. #else
  317. if (!implementation_initialized) {
  318. #endif
  319. /* Initialize the implementation. */
  320. host_implementation = get_shuffle_implementation();
  321. /* Set the flag indicating the implementation has been initialized. */
  322. implementation_initialized = 1;
  323. }
  324. }
  325. /* Shuffle a block by dynamically dispatching to the appropriate
  326. hardware-accelerated routine at run-time. */
  327. void
  328. shuffle(const size_t bytesoftype, const size_t blocksize,
  329. const uint8_t* _src, const uint8_t* _dest) {
  330. /* Initialize the shuffle implementation if necessary. */
  331. init_shuffle_implementation();
  332. /* The implementation is initialized.
  333. Dispatch to it's shuffle routine. */
  334. (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
  335. }
  336. /* Unshuffle a block by dynamically dispatching to the appropriate
  337. hardware-accelerated routine at run-time. */
  338. void
  339. unshuffle(const size_t bytesoftype, const size_t blocksize,
  340. const uint8_t* _src, const uint8_t* _dest) {
  341. /* Initialize the shuffle implementation if necessary. */
  342. init_shuffle_implementation();
  343. /* The implementation is initialized.
  344. Dispatch to it's unshuffle routine. */
  345. (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
  346. }
  347. /* Bit-shuffle a block by dynamically dispatching to the appropriate
  348. hardware-accelerated routine at run-time. */
  349. int
  350. bitshuffle(const size_t bytesoftype, const size_t blocksize,
  351. const uint8_t* const _src, const uint8_t* _dest,
  352. const uint8_t* _tmp) {
  353. int size = blocksize / bytesoftype;
  354. /* Initialize the shuffle implementation if necessary. */
  355. init_shuffle_implementation();
  356. if ((size % 8) == 0)
  357. /* The number of elems is a multiple of 8 which is supported by
  358. bitshuffle. */
  359. return (int)(host_implementation.bitshuffle)((void*)_src, (void*)_dest,
  360. blocksize / bytesoftype,
  361. bytesoftype, (void*)_tmp);
  362. else
  363. memcpy((void*)_dest, (void*)_src, blocksize);
  364. return size;
  365. }
  366. /* Bit-unshuffle a block by dynamically dispatching to the appropriate
  367. hardware-accelerated routine at run-time. */
  368. int
  369. bitunshuffle(const size_t bytesoftype, const size_t blocksize,
  370. const uint8_t* const _src, const uint8_t* _dest,
  371. const uint8_t* _tmp) {
  372. int size = blocksize / bytesoftype;
  373. /* Initialize the shuffle implementation if necessary. */
  374. init_shuffle_implementation();
  375. if ((size % 8) == 0)
  376. /* The number of elems is a multiple of 8 which is supported by
  377. bitshuffle. */
  378. return (int)(host_implementation.bitunshuffle)((void*)_src, (void*)_dest,
  379. blocksize / bytesoftype,
  380. bytesoftype, (void*)_tmp);
  381. else
  382. memcpy((void*)_dest, (void*)_src, blocksize);
  383. return size;
  384. }