/*
 * Bitshuffle - Filter for improving compression of typed binary data.
 *
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
 * Website: http://www.github.com/kiyo-masui/bitshuffle
 * Created: 2014
 *
 * Note: Adapted for c-blosc by Francesc Alted.
 *
 * See LICENSES/BITSHUFFLE.txt file for details about copyright and
 * rights to use.
 *
 */
#include "bitshuffle-generic.h"
#include "bitshuffle-sse2.h"

#include <string.h>  /* memcpy (used by bshuf_copy) */

/* Make sure SSE2 is available for the compilation target and compiler. */
#if !defined(__SSE2__)
#error SSE2 is not supported by the target architecture/platform and/or this compiler.
#endif

#include <emmintrin.h>
/* The following is useful for debugging purposes. */
#if 0
#include <stdio.h>

static void printxmm(__m128i xmm0)
{
    uint8_t buf[32];

    ((__m128i *)buf)[0] = xmm0;
    printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
           buf[0], buf[1], buf[2], buf[3],
           buf[4], buf[5], buf[6], buf[7],
           buf[8], buf[9], buf[10], buf[11],
           buf[12], buf[13], buf[14], buf[15]);
}
#endif
/* ---- Worker code that requires SSE2. Intel Pentium 4 (2000) and later. ---- */
/* Transpose bytes within elements for 16 bit elements. */
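/* The input is viewed as a 2 x size byte matrix (low byte, high byte per
 * element). Interleaving two 16-byte vectors (the "riffle" done by a
 * lo/hi unpack pair) has order 5 on 32 bytes, so four interleave passes
 * amount to one de-interleave: after the fourth round a0 holds the low
 * byte of 16 consecutive elements and b0 the high byte. */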
int64_t bshuf_trans_byte_elem_SSE_16(void* in, void* out, const size_t size) {
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    __m128i a0, b0, a1, b1;
    size_t ii;

    for (ii = 0; ii + 15 < size; ii += 16) {
        a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]);
        b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]);

        a1 = _mm_unpacklo_epi8(a0, b0);
        b1 = _mm_unpackhi_epi8(a0, b0);

        a0 = _mm_unpacklo_epi8(a1, b1);
        b0 = _mm_unpackhi_epi8(a1, b1);

        a1 = _mm_unpacklo_epi8(a0, b0);
        b1 = _mm_unpackhi_epi8(a0, b0);

        a0 = _mm_unpacklo_epi8(a1, b1);
        b0 = _mm_unpackhi_epi8(a1, b1);

        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
    }
    return bshuf_trans_byte_elem_remainder(in, out, size, 2,
                                           size - size % 16);
}
/* Transpose bytes within elements for 32 bit elements. */
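/* Same perfect-shuffle idea as the 16-bit case, over four registers:
 * three rounds of byte interleaving followed by 64-bit unpacks that
 * regroup the register halves. Each register finishes holding one byte
 * plane (a0 = byte 0 of all 16 elements, ..., d0 = byte 3). */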
int64_t bshuf_trans_byte_elem_SSE_32(void* in, void* out, const size_t size) {
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    __m128i a0, b0, c0, d0, a1, b1, c1, d1;
    size_t ii;

    for (ii = 0; ii + 15 < size; ii += 16) {
        a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]);
        b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]);
        c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]);
        d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]);

        a1 = _mm_unpacklo_epi8(a0, b0);
        b1 = _mm_unpackhi_epi8(a0, b0);
        c1 = _mm_unpacklo_epi8(c0, d0);
        d1 = _mm_unpackhi_epi8(c0, d0);

        a0 = _mm_unpacklo_epi8(a1, b1);
        b0 = _mm_unpackhi_epi8(a1, b1);
        c0 = _mm_unpacklo_epi8(c1, d1);
        d0 = _mm_unpackhi_epi8(c1, d1);

        a1 = _mm_unpacklo_epi8(a0, b0);
        b1 = _mm_unpackhi_epi8(a0, b0);
        c1 = _mm_unpacklo_epi8(c0, d0);
        d1 = _mm_unpackhi_epi8(c0, d0);

        a0 = _mm_unpacklo_epi64(a1, c1);
        b0 = _mm_unpackhi_epi64(a1, c1);
        c0 = _mm_unpacklo_epi64(b1, d1);
        d0 = _mm_unpackhi_epi64(b1, d1);

        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
        _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
        _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
    }
    return bshuf_trans_byte_elem_remainder(in, out, size, 4,
                                           size - size % 16);
}
/* Transpose bytes within elements for 64 bit elements. */
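/* Extends the same scheme to eight registers: two byte-interleave rounds,
 * then 32-bit and 64-bit unpacks to regroup. Each register finishes with
 * one byte plane (a0 = byte 0 of all 16 elements, ..., h0 = byte 7). */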
int64_t bshuf_trans_byte_elem_SSE_64(void* in, void* out, const size_t size) {
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
    __m128i a1, b1, c1, d1, e1, f1, g1, h1;
    size_t ii;

    for (ii = 0; ii + 15 < size; ii += 16) {
        a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]);
        b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]);
        c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]);
        d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]);
        e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]);
        f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]);
        g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]);
        h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]);

        a1 = _mm_unpacklo_epi8(a0, b0);
        b1 = _mm_unpackhi_epi8(a0, b0);
        c1 = _mm_unpacklo_epi8(c0, d0);
        d1 = _mm_unpackhi_epi8(c0, d0);
        e1 = _mm_unpacklo_epi8(e0, f0);
        f1 = _mm_unpackhi_epi8(e0, f0);
        g1 = _mm_unpacklo_epi8(g0, h0);
        h1 = _mm_unpackhi_epi8(g0, h0);

        a0 = _mm_unpacklo_epi8(a1, b1);
        b0 = _mm_unpackhi_epi8(a1, b1);
        c0 = _mm_unpacklo_epi8(c1, d1);
        d0 = _mm_unpackhi_epi8(c1, d1);
        e0 = _mm_unpacklo_epi8(e1, f1);
        f0 = _mm_unpackhi_epi8(e1, f1);
        g0 = _mm_unpacklo_epi8(g1, h1);
        h0 = _mm_unpackhi_epi8(g1, h1);

        a1 = _mm_unpacklo_epi32(a0, c0);
        b1 = _mm_unpackhi_epi32(a0, c0);
        c1 = _mm_unpacklo_epi32(b0, d0);
        d1 = _mm_unpackhi_epi32(b0, d0);
        e1 = _mm_unpacklo_epi32(e0, g0);
        f1 = _mm_unpackhi_epi32(e0, g0);
        g1 = _mm_unpacklo_epi32(f0, h0);
        h1 = _mm_unpackhi_epi32(f0, h0);

        a0 = _mm_unpacklo_epi64(a1, e1);
        b0 = _mm_unpackhi_epi64(a1, e1);
        c0 = _mm_unpacklo_epi64(b1, f1);
        d0 = _mm_unpackhi_epi64(b1, f1);
        e0 = _mm_unpacklo_epi64(c1, g1);
        f0 = _mm_unpackhi_epi64(c1, g1);
        g0 = _mm_unpacklo_epi64(d1, h1);
        h0 = _mm_unpackhi_epi64(d1, h1);

        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
        _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
        _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
        _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0);
        _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0);
        _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0);
        _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0);
    }
    return bshuf_trans_byte_elem_remainder(in, out, size, 8,
                                           size - size % 16);
}
/* Memory copy with bshuf call signature. */
int64_t bshuf_copy(void* in, void* out, const size_t size,
                   const size_t elem_size) {
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    memcpy(out_b, in_b, size * elem_size);
    return size * elem_size;
}
/* Transpose bytes within elements using the best SSE algorithm available. */
int64_t bshuf_trans_byte_elem_sse2(void* in, void* out, const size_t size,
                                   const size_t elem_size, void* tmp_buf) {
    int64_t count;

    /* Trivial cases: power of 2 bytes. */
    switch (elem_size) {
        case 1:
            count = bshuf_copy(in, out, size, elem_size);
            return count;
        case 2:
            count = bshuf_trans_byte_elem_SSE_16(in, out, size);
            return count;
        case 4:
            count = bshuf_trans_byte_elem_SSE_32(in, out, size);
            return count;
        case 8:
            count = bshuf_trans_byte_elem_SSE_64(in, out, size);
            return count;
    }

    /* Worst case: odd number of bytes. This turns out to be faster for
     * (odd * 2) byte elements as well (hence the % 4). */
    if (elem_size % 4) {
        count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
        return count;
    }

    /* Multiple of power of 2: transpose hierarchically. */
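    /* E.g., elem_size == 12 lands in the int32_t branch below with
     * nchunk_elem == 3: each element is treated as 3 chunks of 4 bytes.
     * Roughly, TRANS_ELEM_TYPE gathers equally positioned 4-byte chunks
     * together, the SSE 32-bit kernel transposes the bytes within them,
     * and bshuf_trans_elem restores the final ordering. */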
    {
        size_t nchunk_elem;

        if ((elem_size % 8) == 0) {
            nchunk_elem = elem_size / 8;
            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
            count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf,
                                                 size * nchunk_elem);
            bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
        } else if ((elem_size % 4) == 0) {
            nchunk_elem = elem_size / 4;
            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
            count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf,
                                                 size * nchunk_elem);
            bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
        } else {
            /* Not used since the scalar algorithm is faster. */
            nchunk_elem = elem_size / 2;
            TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
            count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
                                                 size * nchunk_elem);
            bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
        }
        return count;
    }
}
/* Transpose bits within bytes. */
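/* The kernel below relies on _mm_movemask_epi8, which collects the most
 * significant bit of each of the 16 bytes in a vector into a 16-bit mask.
 * Shifting the vector left by one bit (_mm_slli_epi16) then brings the
 * next lower bit of every byte into the sign position, so eight
 * movemask/shift rounds emit all eight bit planes of a 16-byte block. */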
int64_t bshuf_trans_bit_byte_sse2(void* in, void* out, const size_t size,
                                  const size_t elem_size) {
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    uint16_t* out_ui16;
    int64_t count;
    size_t nbyte = elem_size * size;
    __m128i xmm;
    int32_t bt;
    size_t ii, kk;

    CHECK_MULT_EIGHT(nbyte);

    for (ii = 0; ii + 15 < nbyte; ii += 16) {
        xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            bt = _mm_movemask_epi8(xmm);
            xmm = _mm_slli_epi16(xmm, 1);
            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_ui16 = bt;
        }
    }
    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                           nbyte - nbyte % 16);
    return count;
}
/* Transpose bits within elements. */
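/* This runs in three steps: transpose bytes within elements, transpose
 * bits within the resulting bytes, and finally regroup the bit rows
 * (one row per bit position) into the output layout. */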
int64_t bshuf_trans_bit_elem_sse2(void* in, void* out, const size_t size,
                                  const size_t elem_size, void* tmp_buf) {
    int64_t count;

    CHECK_MULT_EIGHT(size);

    count = bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf);
    CHECK_ERR(count);
    count = bshuf_trans_bit_byte_sse2(out, tmp_buf, size, elem_size);
    CHECK_ERR(count);
    count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
    return count;
}
/* For data organized into a row for each bit (8 * elem_size rows),
 * transpose the bytes. */
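/* The inner loop transposes an 8x16 block of bytes: three rounds of
 * unpack widen the interleave from bytes to 16-bit to 32-bit groups,
 * leaving each register with two 8-byte groups, each holding one byte
 * from every input row. The groups are written out with 64-bit low/high
 * stores. */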
int64_t bshuf_trans_byte_bitrow_sse2(void* in, void* out, const size_t size,
                                     const size_t elem_size) {
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    size_t nrows = 8 * elem_size;
    size_t nbyte_row = size / 8;
    size_t ii, jj;
    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
    __m128i a1, b1, c1, d1, e1, f1, g1, h1;
    __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;

    CHECK_MULT_EIGHT(size);

    for (ii = 0; ii + 7 < nrows; ii += 8) {
        for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
            a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
            b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
            c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
            d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
            e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
            f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
            g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
            h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);

            a1 = _mm_unpacklo_epi8(a0, b0);
            b1 = _mm_unpacklo_epi8(c0, d0);
            c1 = _mm_unpacklo_epi8(e0, f0);
            d1 = _mm_unpacklo_epi8(g0, h0);
            e1 = _mm_unpackhi_epi8(a0, b0);
            f1 = _mm_unpackhi_epi8(c0, d0);
            g1 = _mm_unpackhi_epi8(e0, f0);
            h1 = _mm_unpackhi_epi8(g0, h0);

            a0 = _mm_unpacklo_epi16(a1, b1);
            b0 = _mm_unpacklo_epi16(c1, d1);
            c0 = _mm_unpackhi_epi16(a1, b1);
            d0 = _mm_unpackhi_epi16(c1, d1);
            e0 = _mm_unpacklo_epi16(e1, f1);
            f0 = _mm_unpacklo_epi16(g1, h1);
            g0 = _mm_unpackhi_epi16(e1, f1);
            h0 = _mm_unpackhi_epi16(g1, h1);

            a1 = _mm_unpacklo_epi32(a0, b0);
            b1 = _mm_unpackhi_epi32(a0, b0);
            c1 = _mm_unpacklo_epi32(c0, d0);
            d1 = _mm_unpackhi_epi32(c0, d0);
            e1 = _mm_unpacklo_epi32(e0, f0);
            f1 = _mm_unpackhi_epi32(e0, f0);
            g1 = _mm_unpacklo_epi32(g0, h0);
            h1 = _mm_unpackhi_epi32(g0, h0);

            /* There is no integer "storeh" instruction, so interpret the
             * registers as floats and use _mm_storeh_pi; an integer
             * "storel" does exist (_mm_storel_epi64). */
            as = (__m128 *) &a1;
            bs = (__m128 *) &b1;
            cs = (__m128 *) &c1;
            ds = (__m128 *) &d1;
            es = (__m128 *) &e1;
            fs = (__m128 *) &f1;
            gs = (__m128 *) &g1;
            hs = (__m128 *) &h1;

            _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as);
            _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds);
            _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es);
            _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs);
            _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs);

            _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs);
            _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs);
        }
        for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj++) {
            out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
            out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
            out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
            out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
            out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
            out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
            out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
            out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
        }
    }
    return size * elem_size;
}
/* Shuffle bits within the bytes of eight element blocks. */
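/* Applies the same movemask/shift bit-plane extraction as
 * bshuf_trans_bit_byte_sse2, but within each block of eight elements
 * rather than across the whole buffer. */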
int64_t bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size,
                                         const size_t elem_size) {
    /* With a bit of care, this could be written such that it is
     * in_buf == out_buf safe. */
    char* in_b = (char*) in;
    uint16_t* out_ui16 = (uint16_t*) out;
    size_t nbyte = elem_size * size;
    __m128i xmm;
    int32_t bt;
    size_t ii, jj, kk;
    size_t ind;

    CHECK_MULT_EIGHT(size);

    if (elem_size % 2) {
        bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
    } else {
        for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
             ii += 8 * elem_size) {
            for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
                xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    bt = _mm_movemask_epi8(xmm);
                    xmm = _mm_slli_epi16(xmm, 1);
                    ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    out_ui16[ind / 2] = bt;
                }
            }
        }
    }
    return size * elem_size;
}
/* Untranspose bits within elements. */
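/* Inverse of bshuf_trans_bit_elem_sse2: first transpose the bit-row bytes
 * back, then shuffle the bits into place within each eight-element block. */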
int64_t bshuf_untrans_bit_elem_sse2(void* in, void* out, const size_t size,
                                    const size_t elem_size, void* tmp_buf) {
    int64_t count;

    CHECK_MULT_EIGHT(size);

    count = bshuf_trans_byte_bitrow_sse2(in, tmp_buf, size, elem_size);
    CHECK_ERR(count);
    count = bshuf_shuffle_bit_eightelem_sse2(tmp_buf, out, size, elem_size);
    return count;
}
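
/* A minimal round-trip sketch (not part of the library) showing how the
 * SSE2 entry points above are meant to be called. It assumes tmp must
 * hold size * elem_size bytes and that size is a multiple of 8; enable
 * the block to try it. */
#if 0
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    const size_t size = 1024;     /* number of elements; multiple of 8 */
    const size_t elem_size = 4;   /* bytes per element */
    const size_t nbyte = size * elem_size;
    char *in = malloc(nbyte), *shuf = malloc(nbyte);
    char *back = malloc(nbyte), *tmp = malloc(nbyte);
    size_t i;

    /* Fill the input with an arbitrary test pattern. */
    for (i = 0; i < nbyte; i++)
        in[i] = (char)(i * 31 + 7);

    /* Bitshuffle, then unshuffle, and check we got the input back. */
    bshuf_trans_bit_elem_sse2(in, shuf, size, elem_size, tmp);
    bshuf_untrans_bit_elem_sse2(shuf, back, size, elem_size, tmp);
    printf("round trip %s\n", memcmp(in, back, nbyte) ? "FAILED" : "OK");

    free(in); free(shuf); free(back); free(tmp);
    return 0;
}
#endif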