shuffle.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. /*********************************************************************
  2. Blosc - Blocked Shuffling and Compression Library
  3. Author: Francesc Alted <francesc@blosc.org>
  4. Creation date: 2009-05-20
  5. See LICENSES/BLOSC.txt for details about copyright and rights to use.
  6. **********************************************************************/
  7. #include "shuffle.h"
  8. #include "shuffle-generic.h"
  9. #include "bitshuffle-generic.h"
  10. #include <stdio.h>
  /* Provide a `bool` type.
     C99-and-later compilers ship <stdbool.h>, so use it instead of a
     hand-written typedef: `typedef _Bool bool;` does not compile when
     <stdbool.h> has already been included (there `bool` is a macro that
     expands to `_Bool`), nor in C23 where `bool` is a keyword.
     Visual Studio < 2013 does not have stdbool.h, so compilers that do not
     announce C99 support still get a replacement type. */
  #if defined __STDC__ && defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
  /* have a C99 compiler */
  #include <stdbool.h>
  #else
  /* do not have a C99 compiler */
  typedef unsigned char bool;
  #endif
  /* HAVE_CPU_FEAT_INTRIN gates the __builtin_cpu_supports()-based feature
     detection.  It is defined only for real GCC (not clang) at version 4.8
     or newer.  The version test is parenthesized as a whole: `&&` binds
     tighter than `||`, so without the parentheses the clang/GCC guards would
     only apply to the `__GNUC__ >= 5` alternative. */
  #if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
      (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
  #define HAVE_CPU_FEAT_INTRIN
  #endif
  23. /* Include hardware-accelerated shuffle/unshuffle routines based on
  24. the target architecture. Note that a target architecture may support
  25. more than one type of acceleration!*/
  26. #if defined(SHUFFLE_AVX2_ENABLED)
  27. #include "shuffle-avx2.h"
  28. #include "bitshuffle-avx2.h"
  29. #endif /* defined(SHUFFLE_AVX2_ENABLED) */
  30. #if defined(SHUFFLE_SSE2_ENABLED)
  31. #include "shuffle-sse2.h"
  32. #include "bitshuffle-sse2.h"
  33. #endif /* defined(SHUFFLE_SSE2_ENABLED) */
  /* Function pointer types for the byte-wise routines.  Shuffle and unshuffle
     share one signature; the dispatchers in this file call them as
     (bytesoftype, blocksize, src, dest). */
  typedef void (*shuffle_func)(const size_t bytesoftype, const size_t blocksize,
                               const uint8_t* src, const uint8_t* dest);
  typedef shuffle_func unshuffle_func;

  /* Function pointer types for the bit-wise routines.  The dispatchers in this
     file call them as (src, dest, number of elements, bytesoftype, tmp). */
  typedef int64_t (*bitshuffle_func)(void* src, void* dest, const size_t nelems,
                                     const size_t bytesoftype, void* tmp);
  typedef bitshuffle_func bitunshuffle_func;

  /* One complete set of shuffle routines, plus a label for it. */
  struct shuffle_implementation {
    const char* name;               /* Name of this implementation. */
    shuffle_func shuffle;           /* Byte shuffle routine. */
    unshuffle_func unshuffle;       /* Byte unshuffle routine. */
    bitshuffle_func bitshuffle;     /* Bit shuffle routine. */
    bitunshuffle_func bitunshuffle; /* Bit unshuffle routine. */
  };
  typedef struct shuffle_implementation shuffle_implementation_t;
  /* Acceleration capabilities of the host, one bit per capability so that
     several of them can be combined with bitwise OR. */
  typedef enum {
    BLOSC_HAVE_NOTHING = 0,
    BLOSC_HAVE_SSE2 = 1 << 0,
    BLOSC_HAVE_AVX2 = 1 << 1
  } blosc_cpu_features;
  57. /* Detect hardware and set function pointers to the best shuffle/unshuffle
  58. implementations supported by the host processor. */
  59. #if defined(SHUFFLE_AVX2_ENABLED) || defined(SHUFFLE_SSE2_ENABLED) /* Intel/i686 */
  60. /* Disabled the __builtin_cpu_supports() call, as it has issues with
  61. new versions of gcc (like 5.3.1 in forthcoming ubuntu/xenial):
  62. "undefined symbol: __cpu_model"
  63. For a similar report, see:
  64. https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/thread/ZM2L65WIZEEQHHLFERZYD5FAG7QY2OGB/
  65. */
  66. #if defined(HAVE_CPU_FEAT_INTRIN) && 0
  67. static blosc_cpu_features blosc_get_cpu_features(void) {
  68. blosc_cpu_features cpu_features = BLOSC_HAVE_NOTHING;
  69. if (__builtin_cpu_supports("sse2")) {
  70. cpu_features |= BLOSC_HAVE_SSE2;
  71. }
  72. if (__builtin_cpu_supports("avx2")) {
  73. cpu_features |= BLOSC_HAVE_AVX2;
  74. }
  75. return cpu_features;
  76. }
  77. #else
  78. #if defined(_MSC_VER) && !defined(__clang__)
  79. #include <intrin.h> /* Needed for __cpuid */
  80. /* _xgetbv is only supported by VS2010 SP1 and newer versions of VS. */
  81. #if _MSC_FULL_VER >= 160040219
  82. #include <immintrin.h> /* Needed for _xgetbv */
  83. #elif defined(_M_IX86)
  84. /* Implement _xgetbv for VS2008 and VS2010 RTM with 32-bit (x86) targets. */
  85. static uint64_t _xgetbv(uint32_t xcr) {
  86. uint32_t xcr0, xcr1;
  87. __asm {
  88. mov ecx, xcr
  89. _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
  90. mov xcr0, eax
  91. mov xcr1, edx
  92. }
  93. return ((uint64_t)xcr1 << 32) | xcr0;
  94. }
  95. #elif defined(_M_X64)
  /* _xgetbv replacement for VS2008 and VS2010 RTM on 64-bit (x64) targets.
     No instruction is executed here.  These compilers don't support any of
     the newer acceleration ISAs (e.g., AVX2) used by blosc, and all x64
     hardware supports SSE2, so a hard-coded answer is sufficient:
     register 0 reports only bit 1 set, every other register reports 0. */
  static __inline uint64_t _xgetbv(uint32_t xcr) {
    if (xcr != 0) {
      return 0UL;
    }
    /* A 64-bit OS must have XMM save support. */
    return (1UL << 1);
  }
  105. #else
  106. /* Hardware detection for any other MSVC targets (e.g., ARM)
  107. isn't implemented at this time. */
  108. #error This version of c-blosc only supports x86 and x64 targets with MSVC.
  109. #endif /* _MSC_FULL_VER >= 160040219 */
  110. #else
  /* Implement the __cpuidex intrinsic for GCC, Clang, and others using
     inline assembly.  Executes `cpuid` with eax = function_id and
     ecx = subfunction_id, then stores the resulting eax, ebx, ecx, edx
     into cpuInfo[0..3] in that order. */
  __attribute__((always_inline))
  static inline void
  __cpuidex(int32_t cpuInfo[4], int32_t function_id, int32_t subfunction_id) {
    int32_t reg_a, reg_b, reg_c, reg_d;

    __asm__ __volatile__ (
  #if defined(__i386__) && defined(__PIC__)
      /* Can't clobber ebx with PIC running under 32-bit, so it is parked in
         edi around the instruction and swapped back afterwards; the ebx
         result is therefore read from edi.
         https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
      */
      "movl %%ebx, %%edi\n\t"
      "cpuid\n\t"
      "xchgl %%ebx, %%edi"
      : "=D" (reg_b),
  #else
      "cpuid"
      : "=b" (reg_b),
  #endif /* defined(__i386__) && defined(__PIC__) */
        "=a" (reg_a),
        "=c" (reg_c),
        "=d" (reg_d)
      : "a" (function_id), "c" (subfunction_id)
    );

    cpuInfo[0] = reg_a;
    cpuInfo[1] = reg_b;
    cpuInfo[2] = reg_c;
    cpuInfo[3] = reg_d;
  }
  135. #define __cpuid(cpuInfo, function_id) __cpuidex(cpuInfo, function_id, 0)
  136. #define _XCR_XFEATURE_ENABLED_MASK 0
  /* Read the extended control register selected by `xcr` and return its
     64-bit content (edx holds the upper half, eax the lower half).
     https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
  */
  static inline uint64_t
  _xgetbv(uint32_t xcr) {
    uint32_t low_half, high_half;

    __asm__ __volatile__ (
      /* This is the "xgetbv" instruction, written as raw bytes because some
         older compilers have issues with the mnemonic form. */
      ".byte 0x0f, 0x01, 0xd0"
      : "=a" (low_half), "=d" (high_half)
      : "c" (xcr)
    );

    return (uint64_t)low_half | ((uint64_t)high_half << 32);
  }
  155. #endif /* defined(_MSC_FULL_VER) */
  156. #ifndef _XCR_XFEATURE_ENABLED_MASK
  157. #define _XCR_XFEATURE_ENABLED_MASK 0x0
  158. #endif
  159. static blosc_cpu_features blosc_get_cpu_features(void) {
  160. blosc_cpu_features result = BLOSC_HAVE_NOTHING;
  161. int32_t max_basic_function_id;
  162. /* Holds the values of eax, ebx, ecx, edx set by the `cpuid` instruction */
  163. int32_t cpu_info[4];
  164. int sse2_available;
  165. int sse3_available;
  166. int ssse3_available;
  167. int sse41_available;
  168. int sse42_available;
  169. int xsave_available;
  170. int xsave_enabled_by_os;
  171. int avx2_available = 0;
  172. int avx512bw_available = 0;
  173. int xmm_state_enabled = 0;
  174. int ymm_state_enabled = 0;
  175. int zmm_state_enabled = 0;
  176. uint64_t xcr0_contents;
  177. char* envvar;
  178. /* Get the number of basic functions available. */
  179. __cpuid(cpu_info, 0);
  180. max_basic_function_id = cpu_info[0];
  181. /* Check for SSE-based features and required OS support */
  182. __cpuid(cpu_info, 1);
  183. sse2_available = (cpu_info[3] & (1 << 26)) != 0;
  184. sse3_available = (cpu_info[2] & (1 << 0)) != 0;
  185. ssse3_available = (cpu_info[2] & (1 << 9)) != 0;
  186. sse41_available = (cpu_info[2] & (1 << 19)) != 0;
  187. sse42_available = (cpu_info[2] & (1 << 20)) != 0;
  188. xsave_available = (cpu_info[2] & (1 << 26)) != 0;
  189. xsave_enabled_by_os = (cpu_info[2] & (1 << 27)) != 0;
  190. /* Check for AVX-based features, if the processor supports extended features. */
  191. if (max_basic_function_id >= 7) {
  192. __cpuid(cpu_info, 7);
  193. avx2_available = (cpu_info[1] & (1 << 5)) != 0;
  194. avx512bw_available = (cpu_info[1] & (1 << 30)) != 0;
  195. }
  196. /* Even if certain features are supported by the CPU, they may not be supported
  197. by the OS (in which case using them would crash the process or system).
  198. If xsave is available and enabled by the OS, check the contents of the
  199. extended control register XCR0 to see if the CPU features are enabled. */
  200. #if defined(_XCR_XFEATURE_ENABLED_MASK)
  201. if (xsave_available && xsave_enabled_by_os && (
  202. sse2_available || sse3_available || ssse3_available
  203. || sse41_available || sse42_available
  204. || avx2_available || avx512bw_available)) {
  205. /* Determine which register states can be restored by the OS. */
  206. xcr0_contents = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
  207. xmm_state_enabled = (xcr0_contents & (1UL << 1)) != 0;
  208. ymm_state_enabled = (xcr0_contents & (1UL << 2)) != 0;
  209. /* Require support for both the upper 256-bits of zmm0-zmm15 to be
  210. restored as well as all of zmm16-zmm31 and the opmask registers. */
  211. zmm_state_enabled = (xcr0_contents & 0x70) == 0x70;
  212. }
  213. #endif /* defined(_XCR_XFEATURE_ENABLED_MASK) */
  214. envvar = getenv("BLOSC_PRINT_SHUFFLE_ACCEL");
  215. if (envvar != NULL) {
  216. printf("Shuffle CPU Information:\n");
  217. printf("SSE2 available: %s\n", sse2_available ? "True" : "False");
  218. printf("SSE3 available: %s\n", sse3_available ? "True" : "False");
  219. printf("SSSE3 available: %s\n", ssse3_available ? "True" : "False");
  220. printf("SSE4.1 available: %s\n", sse41_available ? "True" : "False");
  221. printf("SSE4.2 available: %s\n", sse42_available ? "True" : "False");
  222. printf("AVX2 available: %s\n", avx2_available ? "True" : "False");
  223. printf("AVX512BW available: %s\n", avx512bw_available ? "True" : "False");
  224. printf("XSAVE available: %s\n", xsave_available ? "True" : "False");
  225. printf("XSAVE enabled: %s\n", xsave_enabled_by_os ? "True" : "False");
  226. printf("XMM state enabled: %s\n", xmm_state_enabled ? "True" : "False");
  227. printf("YMM state enabled: %s\n", ymm_state_enabled ? "True" : "False");
  228. printf("ZMM state enabled: %s\n", zmm_state_enabled ? "True" : "False");
  229. }
  230. /* Using the gathered CPU information, determine which implementation to use. */
  231. /* technically could fail on sse2 cpu on os without xmm support, but that
  232. * shouldn't exist anymore */
  233. if (sse2_available) {
  234. result |= BLOSC_HAVE_SSE2;
  235. }
  236. if (xmm_state_enabled && ymm_state_enabled && avx2_available) {
  237. result |= BLOSC_HAVE_AVX2;
  238. }
  239. return result;
  240. }
  241. #endif
  242. #else /* No hardware acceleration supported for the target architecture. */
  243. #if defined(_MSC_VER)
  244. #pragma message("Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.")
  245. #else
  246. #warning Hardware-acceleration detection not implemented for the target architecture. Only the generic shuffle/unshuffle routines will be available.
  247. #endif
  248. static blosc_cpu_features blosc_get_cpu_features(void) {
  249. return BLOSC_HAVE_NOTHING;
  250. }
  251. #endif
  252. static shuffle_implementation_t get_shuffle_implementation(void) {
  253. blosc_cpu_features cpu_features = blosc_get_cpu_features();
  254. shuffle_implementation_t impl_generic;
  255. #if defined(SHUFFLE_AVX2_ENABLED)
  256. if (cpu_features & BLOSC_HAVE_AVX2) {
  257. shuffle_implementation_t impl_avx2;
  258. impl_avx2.name = "avx2";
  259. impl_avx2.shuffle = (shuffle_func)shuffle_avx2;
  260. impl_avx2.unshuffle = (unshuffle_func)unshuffle_avx2;
  261. impl_avx2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_avx2;
  262. impl_avx2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_avx2;
  263. return impl_avx2;
  264. }
  265. #endif /* defined(SHUFFLE_AVX2_ENABLED) */
  266. #if defined(SHUFFLE_SSE2_ENABLED)
  267. if (cpu_features & BLOSC_HAVE_SSE2) {
  268. shuffle_implementation_t impl_sse2;
  269. impl_sse2.name = "sse2";
  270. impl_sse2.shuffle = (shuffle_func)shuffle_sse2;
  271. impl_sse2.unshuffle = (unshuffle_func)unshuffle_sse2;
  272. impl_sse2.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_sse2;
  273. impl_sse2.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_sse2;
  274. return impl_sse2;
  275. }
  276. #endif /* defined(SHUFFLE_SSE2_ENABLED) */
  277. /* Processor doesn't support any of the hardware-accelerated implementations,
  278. so use the generic implementation. */
  279. impl_generic.name = "generic";
  280. impl_generic.shuffle = (shuffle_func)shuffle_generic;
  281. impl_generic.unshuffle = (unshuffle_func)unshuffle_generic;
  282. impl_generic.bitshuffle = (bitshuffle_func)bshuf_trans_bit_elem_scal;
  283. impl_generic.bitunshuffle = (bitunshuffle_func)bshuf_untrans_bit_elem_scal;
  284. return impl_generic;
  285. }
  286. /* Flag indicating whether the implementation has been initialized.
  287. Zero means it hasn't been initialized, non-zero means it has. */
  288. static int32_t implementation_initialized;
  289. /* The dynamically-chosen shuffle/unshuffle implementation.
  290. This is only safe to use once `implementation_initialized` is set. */
  291. static shuffle_implementation_t host_implementation;
  292. /* Initialize the shuffle implementation, if necessary. */
  293. #if defined(__GNUC__) || defined(__clang__)
  294. __attribute__((always_inline))
  295. #endif
  296. static
  297. #if defined(_MSC_VER)
  298. __forceinline
  299. #else
  300. inline
  301. #endif
  302. void init_shuffle_implementation(void) {
  303. /* Initialization could (in rare cases) take place concurrently on
  304. multiple threads, but it shouldn't matter because the
  305. initialization should return the same result on each thread (so
  306. the implementation will be the same). Since that's the case we
  307. can avoid complicated synchronization here and get a small
  308. performance benefit because we don't need to perform a volatile
  309. load on the initialization variable each time this function is
  310. called. */
  311. #if defined(__GNUC__) || defined(__clang__)
  312. if (__builtin_expect(!implementation_initialized, 0)) {
  313. #else
  314. if (!implementation_initialized) {
  315. #endif
  316. /* Initialize the implementation. */
  317. host_implementation = get_shuffle_implementation();
  318. /* Set the flag indicating the implementation has been initialized. */
  319. implementation_initialized = 1;
  320. }
  321. }
  322. /* Shuffle a block by dynamically dispatching to the appropriate
  323. hardware-accelerated routine at run-time. */
  324. void
  325. shuffle(const size_t bytesoftype, const size_t blocksize,
  326. const uint8_t* _src, const uint8_t* _dest) {
  327. /* Initialize the shuffle implementation if necessary. */
  328. init_shuffle_implementation();
  329. /* The implementation is initialized.
  330. Dispatch to it's shuffle routine. */
  331. (host_implementation.shuffle)(bytesoftype, blocksize, _src, _dest);
  332. }
  333. /* Unshuffle a block by dynamically dispatching to the appropriate
  334. hardware-accelerated routine at run-time. */
  335. void
  336. unshuffle(const size_t bytesoftype, const size_t blocksize,
  337. const uint8_t* _src, const uint8_t* _dest) {
  338. /* Initialize the shuffle implementation if necessary. */
  339. init_shuffle_implementation();
  340. /* The implementation is initialized.
  341. Dispatch to it's unshuffle routine. */
  342. (host_implementation.unshuffle)(bytesoftype, blocksize, _src, _dest);
  343. }
  344. /* Bit-shuffle a block by dynamically dispatching to the appropriate
  345. hardware-accelerated routine at run-time. */
  346. int
  347. bitshuffle(const size_t bytesoftype, const size_t blocksize,
  348. const uint8_t* const _src, const uint8_t* _dest,
  349. const uint8_t* _tmp) {
  350. int size = blocksize / bytesoftype;
  351. /* Initialize the shuffle implementation if necessary. */
  352. init_shuffle_implementation();
  353. if ((size % 8) == 0)
  354. /* The number of elems is a multiple of 8 which is supported by
  355. bitshuffle. */
  356. return (int)(host_implementation.bitshuffle)((void*)_src, (void*)_dest,
  357. blocksize / bytesoftype,
  358. bytesoftype, (void*)_tmp);
  359. else
  360. memcpy((void*)_dest, (void*)_src, blocksize);
  361. return size;
  362. }
  363. /* Bit-unshuffle a block by dynamically dispatching to the appropriate
  364. hardware-accelerated routine at run-time. */
  365. int
  366. bitunshuffle(const size_t bytesoftype, const size_t blocksize,
  367. const uint8_t* const _src, const uint8_t* _dest,
  368. const uint8_t* _tmp) {
  369. int size = blocksize / bytesoftype;
  370. /* Initialize the shuffle implementation if necessary. */
  371. init_shuffle_implementation();
  372. if ((size % 8) == 0)
  373. /* The number of elems is a multiple of 8 which is supported by
  374. bitshuffle. */
  375. return (int)(host_implementation.bitunshuffle)((void*)_src, (void*)_dest,
  376. blocksize / bytesoftype,
  377. bytesoftype, (void*)_tmp);
  378. else
  379. memcpy((void*)_dest, (void*)_src, blocksize);
  380. return size;
  381. }