/*
 * Bitshuffle - Filter for improving compression of typed binary data.
 *
 * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
 * Website: http://www.github.com/kiyo-masui/bitshuffle
 * Created: 2014
 *
 * Note: Adapted for c-blosc by Francesc Alted.
 *
 * See LICENSES/BITSHUFFLE.txt file for details about copyright and
 * rights to use.
 *
 */

#include "bitshuffle-generic.h"
#include "bitshuffle-sse2.h"
#include "bitshuffle-avx2.h"

/* Make sure AVX2 is available for the compilation target and compiler. */
#if !defined(__AVX2__)
  #error AVX2 is not supported by the target architecture/platform and/or this compiler.
#endif

#include <immintrin.h>

/* The next is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

static void printymm(__m256i ymm0)
{
  uint8_t buf[32];

  ((__m256i *)buf)[0] = ymm0;
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
         buf[0], buf[1], buf[2], buf[3],
         buf[4], buf[5], buf[6], buf[7],
         buf[8], buf[9], buf[10], buf[11],
         buf[12], buf[13], buf[14], buf[15],
         buf[16], buf[17], buf[18], buf[19],
         buf[20], buf[21], buf[22], buf[23],
         buf[24], buf[25], buf[26], buf[27],
         buf[28], buf[29], buf[30], buf[31]);
}
#endif

/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */

/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_avx2(void* in, void* out, const size_t size,
                                  const size_t elem_size) {

  char* in_b = (char*) in;
  char* out_b = (char*) out;
  int32_t* out_i32;
  size_t nbyte = elem_size * size;
  int64_t count;

  __m256i ymm;
  int32_t bt;
  size_t ii, kk;

  for (ii = 0; ii + 31 < nbyte; ii += 32) {
    ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
    for (kk = 0; kk < 8; kk++) {
      /* movemask gathers the most significant bit of each of the 32 bytes,
       * so shifting left by one per iteration extracts a 32-bit slice of
       * bit-plane 7 - kk on every pass. */
      bt = _mm256_movemask_epi8(ymm);
      ymm = _mm256_slli_epi16(ymm, 1);
      out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
      *out_i32 = bt;
    }
  }
  /* Handle the tail that is not a multiple of 32 bytes with the scalar
   * remainder kernel. */
  count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                         nbyte - nbyte % 32);
  return count;
}
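
/* A scalar reference for the kernel above; a sketch for cross-checking in
 * tests, not part of the library (the name and the `out` pre-zeroing are our
 * own assumptions). Bit b of input byte m lands in bit m % 8 of output byte
 * b * (nbyte / 8) + m / 8, which is exactly what the AVX2 loop computes 32
 * bytes at a time. */
#if 0
static void bshuf_trans_bit_byte_scalar_ref(const char* in, char* out,
                                            size_t size, size_t elem_size) {
  size_t nbyte = elem_size * size;
  size_t m, b;
  memset(out, 0, nbyte);
  for (m = 0; m < nbyte; m++) {
    for (b = 0; b < 8; b++) {
      uint8_t bit = (uint8_t)((in[m] >> b) & 1);
      out[b * (nbyte / 8) + m / 8] |= (char)(bit << (m % 8));
    }
  }
}
#endif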

/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size,
                                  const size_t elem_size, void* tmp_buf) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  count = bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf);
  CHECK_ERR(count);
  count = bshuf_trans_bit_byte_avx2(out, tmp_buf, size, elem_size);
  CHECK_ERR(count);
  count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);

  return count;
}
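
/* Usage sketch (our assumption, mirroring the SSE2 variant's working-buffer
 * contract: tmp_buf must hold at least size * elem_size bytes; BUF_SIZE is a
 * hypothetical caller-chosen constant):
 *
 *   char tmp[BUF_SIZE];   // BUF_SIZE >= size * elem_size
 *   int64_t n = bshuf_trans_bit_elem_avx2(src, dst, size, elem_size, tmp);
 *   if (n < 0) { ... }    // a negative count signals an error
 */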

/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes. */
int64_t bshuf_trans_byte_bitrow_avx2(void* in, void* out, const size_t size,
                                     const size_t elem_size) {

  char* in_b = (char*) in;
  char* out_b = (char*) out;
  size_t nrows = 8 * elem_size;
  size_t nbyte_row = size / 8;
  size_t ii, jj, kk, hh, mm;

  CHECK_MULT_EIGHT(size);

  if (elem_size % 4)
    return bshuf_trans_byte_bitrow_sse2(in, out, size, elem_size);

  __m256i ymm_0[8];
  __m256i ymm_1[8];
  __m256i ymm_storeage[8][4];

  /* Process the byte matrix in tiles 32 columns wide and 32 rows tall. */
  for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
    for (ii = 0; ii + 3 < elem_size; ii += 4) {
      for (hh = 0; hh < 4; hh++) {

        /* Load 8 rows of 32 bytes each. */
        for (kk = 0; kk < 8; kk++){
          ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
              (ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
        }

        /* Interleave at successively wider granularity: bytes, then
         * 16-bit groups, then 32-bit groups. */
        for (kk = 0; kk < 4; kk++){
          ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2],
                                           ymm_0[kk * 2 + 1]);
          ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2],
                                               ymm_0[kk * 2 + 1]);
        }

        for (kk = 0; kk < 2; kk++){
          for (mm = 0; mm < 2; mm++){
            ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
                ymm_1[kk * 4 + mm * 2],
                ymm_1[kk * 4 + mm * 2 + 1]);
            ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
                ymm_1[kk * 4 + mm * 2],
                ymm_1[kk * 4 + mm * 2 + 1]);
          }
        }

        for (kk = 0; kk < 4; kk++){
          ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2],
                                                ymm_0[kk * 2 + 1]);
          ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2],
                                                    ymm_0[kk * 2 + 1]);
        }

        for (kk = 0; kk < 8; kk++){
          ymm_storeage[kk][hh] = ymm_1[kk];
        }
      }

      /* Finish with 64-bit interleaves and 128-bit lane permutes, then
       * store the transposed 32-byte rows. */
      for (mm = 0; mm < 8; mm++) {

        for (kk = 0; kk < 4; kk++){
          ymm_0[kk] = ymm_storeage[mm][kk];
        }

        ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
        ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
        ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
        ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);

        ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
        ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
        ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
        ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);

        _mm256_storeu_si256((__m256i *) &out_b[
            (jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
        _mm256_storeu_si256((__m256i *) &out_b[
            (jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
        _mm256_storeu_si256((__m256i *) &out_b[
            (jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
        _mm256_storeu_si256((__m256i *) &out_b[
            (jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
      }
    }
  }

  /* Transpose the leftover columns (past the last full 32-wide tile) one
   * byte at a time. */
  for (ii = 0; ii < nrows; ii++) {
    for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj++) {
      out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
    }
  }
  return size * elem_size;
}
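
/* Scalar reference for the routine above; a sketch for testing, not part of
 * the library (the name is our own). The whole kernel is a plain byte-matrix
 * transpose of nrows x nbyte_row, which is exactly what the cleanup loop at
 * the end of the AVX2 version computes for the leftover columns. */
#if 0
static void bshuf_trans_byte_bitrow_scalar_ref(const char* in, char* out,
                                               size_t size, size_t elem_size) {
  size_t nrows = 8 * elem_size;
  size_t nbyte_row = size / 8;
  size_t ii, jj;
  for (ii = 0; ii < nrows; ii++)
    for (jj = 0; jj < nbyte_row; jj++)
      out[jj * nrows + ii] = in[ii * nbyte_row + jj];
}
#endif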

/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_avx2(void* in, void* out, const size_t size,
                                         const size_t elem_size) {

  CHECK_MULT_EIGHT(size);

  /* With a bit of care, this could be written such that it is
   * in_buf = out_buf safe. */
  char* in_b = (char*) in;
  char* out_b = (char*) out;

  size_t nbyte = elem_size * size;
  size_t ii, jj, kk, ind;
  __m256i ymm;
  int32_t bt;

  if (elem_size % 4) {
    return bshuf_shuffle_bit_eightelem_sse2(in, out, size, elem_size);
  } else {
    for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
      for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
           ii += 8 * elem_size) {
        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
        for (kk = 0; kk < 8; kk++) {
          /* Same movemask/shift bit-plane extraction as in
           * bshuf_trans_bit_byte_avx2, but the planes are scattered
           * within each block of eight elements. */
          bt = _mm256_movemask_epi8(ymm);
          ymm = _mm256_slli_epi16(ymm, 1);
          ind = (ii + jj / 8 + (7 - kk) * elem_size);
          * (int32_t *) &out_b[ind] = bt;
        }
      }
    }
  }
  return size * elem_size;
}

/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size,
                                    const size_t elem_size, void* tmp_buf) {

  int64_t count;

  CHECK_MULT_EIGHT(size);

  count = bshuf_trans_byte_bitrow_avx2(in, tmp_buf, size, elem_size);
  CHECK_ERR(count);
  count = bshuf_shuffle_bit_eightelem_avx2(tmp_buf, out, size, elem_size);

  return count;
}
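
/* Round-trip sketch (assumptions: size is a multiple of 8, tmp is a scratch
 * buffer of at least size * elem_size bytes as for the forward transform,
 * and src, dst and restored are hypothetical caller buffers):
 *
 *   bshuf_trans_bit_elem_avx2(src, dst, size, elem_size, tmp);
 *   bshuf_untrans_bit_elem_avx2(dst, restored, size, elem_size, tmp);
 *   // restored should now match src byte for byte.
 */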