123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248 |
- /*
- * Bitshuffle - Filter for improving compression of typed binary data.
- *
- * Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
- * Website: http://www.github.com/kiyo-masui/bitshuffle
- * Created: 2014
- *
- * Note: Adapted for c-blosc by Francesc Alted.
- *
- * See LICENSES/BITSHUFFLE.txt file for details about copyright and
- * rights to use.
- *
- */
#include "bitshuffle-generic.h"
#include "bitshuffle-sse2.h"
#include "bitshuffle-avx2.h"

/* Make sure AVX2 is available for the compilation target and compiler. */
#if !defined(__AVX2__)
  #error AVX2 is not supported by the target architecture/platform and/or this compiler.
#endif

#include <immintrin.h>
#include <string.h>
- /* The next is useful for debugging purposes */
- #if 0
- #include <stdio.h>
- #include <string.h>
/* Debug helper: print the 32 bytes of an AVX2 register as comma-separated
 * hexadecimal values (buf[0] first), terminated by a newline. */
static void printymm(__m256i ymm0)
{
    uint8_t buf[32];
    int i;

    /* `buf` has no 32-byte alignment guarantee, so spill the register with
     * the unaligned store instead of assigning through a casted pointer. */
    _mm256_storeu_si256((__m256i *) buf, ymm0);

    for (i = 0; i < 32; i++) {
        /* Same output as one long "%x,%x,...,%x\n" format string. */
        printf("%x%s", (unsigned int) buf[i], (i < 31) ? "," : "\n");
    }
}
- #endif
- /* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
/* Transpose bits within bytes.
 *
 * Processes the `size * elem_size` input bytes 32 at a time. For each chunk
 * starting at byte `ii`, and for each bit position `bit` (7 down to 0), the
 * 32-bit mask holding that bit of every byte in the chunk is written to
 * `out` at byte offset `(bit * nbyte + ii) / 8`.
 *
 * Bytes past the last full 32-byte chunk are delegated to
 * bshuf_trans_bit_byte_remainder(), whose return value is returned as-is.
 */
int64_t bshuf_trans_bit_byte_avx2(void* in, void* out, const size_t size,
                                  const size_t elem_size) {
    const char* src = (const char*) in;
    char* dst = (char*) out;
    const size_t nbyte = elem_size * size;
    size_t ii, bit;

    for (ii = 0; ii + 31 < nbyte; ii += 32) {
        __m256i ymm = _mm256_loadu_si256((const __m256i *) &src[ii]);

        /* movemask collects the top bit of every byte; shifting left by one
         * afterwards exposes the next lower bit for the following pass. */
        for (bit = 8; bit-- > 0; ) {
            const int32_t bt = _mm256_movemask_epi8(ymm);
            ymm = _mm256_slli_epi16(ymm, 1);
            /* memcpy rather than a store through a casted int32_t pointer:
             * the destination is a byte buffer and the offset is not
             * guaranteed to be 4-byte aligned. */
            memcpy(&dst[(bit * nbyte + ii) / 8], &bt, sizeof bt);
        }
    }

    return bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
                                          nbyte - nbyte % 32);
}
/* Transpose bits within elements.
 *
 * Runs three stages, bouncing between `out` and `tmp_buf`:
 *   1. bshuf_trans_byte_elem_sse2:  in      -> out     (tmp_buf as scratch)
 *   2. bshuf_trans_bit_byte_avx2:   out     -> tmp_buf
 *   3. bshuf_trans_bitrow_eight:    tmp_buf -> out
 *
 * `size` is checked with CHECK_MULT_EIGHT first. The result of each of the
 * first two stages is passed through CHECK_ERR; otherwise the value produced
 * by the last stage is returned.
 */
int64_t bshuf_trans_bit_elem_avx2(void* in, void* out, const size_t size,
                                  const size_t elem_size, void* tmp_buf) {
    int64_t rc;

    CHECK_MULT_EIGHT(size);

    rc = bshuf_trans_byte_elem_sse2(in, out, size, elem_size, tmp_buf);
    CHECK_ERR(rc);

    rc = bshuf_trans_bit_byte_avx2(out, tmp_buf, size, elem_size);
    CHECK_ERR(rc);

    return bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
}
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes.
 *
 * The input is read as `8 * elem_size` rows of `size / 8` bytes each. The
 * scalar tail loop at the bottom spells out the mapping:
 *     out[col * nrows + row] = in[row * row_len + col]
 *
 * When elem_size is not a multiple of 4 the work is handed to
 * bshuf_trans_byte_bitrow_sse2(). Otherwise full tiles of 32 rows by
 * 32 columns are handled with AVX2 unpack/permute steps, and the columns
 * left over after the last full 32-column tile are copied byte by byte.
 *
 * Returns size * elem_size (after CHECK_MULT_EIGHT(size) has passed).
 */
int64_t bshuf_trans_byte_bitrow_avx2(void* in, void* out, const size_t size,
                                     const size_t elem_size) {
    char* src = (char*) in;
    char* dst = (char*) out;
    const size_t n_rows = 8 * elem_size;
    const size_t row_len = size / 8;
    size_t col, eb, grp, k, m, row;

    CHECK_MULT_EIGHT(size);

    if (elem_size % 4 != 0) {
        return bshuf_trans_byte_bitrow_sse2(in, out, size, elem_size);
    }

    __m256i a[8];
    __m256i b[8];
    __m256i staged[8][4];

    for (col = 0; col + 31 < row_len; col += 32) {
        for (eb = 0; eb + 3 < elem_size; eb += 4) {
            /* Four groups of eight rows: interleave each group up to 32-bit
             * granularity and park the results in `staged`. */
            for (grp = 0; grp < 4; grp++) {
                for (k = 0; k < 8; k++) {
                    a[k] = _mm256_loadu_si256((__m256i *) &src[
                        ((eb + grp) * 8 + k) * row_len + col]);
                }

                /* 8-bit interleave of neighbouring rows. */
                for (k = 0; k < 4; k++) {
                    b[k]     = _mm256_unpacklo_epi8(a[2 * k], a[2 * k + 1]);
                    b[k + 4] = _mm256_unpackhi_epi8(a[2 * k], a[2 * k + 1]);
                }

                /* 16-bit interleave, done separately on each half of b. */
                for (k = 0; k < 8; k += 4) {
                    for (m = 0; m < 2; m++) {
                        a[k + m] = _mm256_unpacklo_epi16(
                            b[k + 2 * m], b[k + 2 * m + 1]);
                        a[k + m + 2] = _mm256_unpackhi_epi16(
                            b[k + 2 * m], b[k + 2 * m + 1]);
                    }
                }

                /* 32-bit interleave of neighbouring pairs. */
                for (k = 0; k < 8; k += 2) {
                    b[k]     = _mm256_unpacklo_epi32(a[k], a[k + 1]);
                    b[k + 1] = _mm256_unpackhi_epi32(a[k], a[k + 1]);
                }

                for (k = 0; k < 8; k++) {
                    staged[k][grp] = b[k];
                }
            }

            /* Merge the four groups (64-bit interleave, then 128-bit lane
             * selection) and write four output rows per iteration. */
            for (m = 0; m < 8; m++) {
                const __m256i g0 = staged[m][0];
                const __m256i g1 = staged[m][1];
                const __m256i g2 = staged[m][2];
                const __m256i g3 = staged[m][3];

                const __m256i lo01 = _mm256_unpacklo_epi64(g0, g1);
                const __m256i lo23 = _mm256_unpacklo_epi64(g2, g3);
                const __m256i hi01 = _mm256_unpackhi_epi64(g0, g1);
                const __m256i hi23 = _mm256_unpackhi_epi64(g2, g3);

                /* 0x20 selects the low 128-bit lanes of both operands,
                 * 0x31 the high lanes. */
                const __m256i r0 = _mm256_permute2x128_si256(lo01, lo23, 0x20);
                const __m256i r1 = _mm256_permute2x128_si256(hi01, hi23, 0x20);
                const __m256i r2 = _mm256_permute2x128_si256(lo01, lo23, 0x31);
                const __m256i r3 = _mm256_permute2x128_si256(hi01, hi23, 0x31);

                const size_t first = col + m * 2;

                _mm256_storeu_si256((__m256i *) &dst[
                    first * n_rows + eb * 8], r0);
                _mm256_storeu_si256((__m256i *) &dst[
                    (first + 1) * n_rows + eb * 8], r1);
                _mm256_storeu_si256((__m256i *) &dst[
                    (first + 16) * n_rows + eb * 8], r2);
                _mm256_storeu_si256((__m256i *) &dst[
                    (first + 17) * n_rows + eb * 8], r3);
            }
        }
    }

    /* Columns beyond the last full block of 32: plain byte transpose. */
    {
        const size_t tail_start = row_len - row_len % 32;

        for (row = 0; row < n_rows; row++) {
            for (col = tail_start; col < row_len; col++) {
                dst[col * n_rows + row] = src[row * row_len + col];
            }
        }
    }

    return size * elem_size;
}
/* Shuffle bits within the bytes of eight element blocks.
 *
 * The buffer is walked in blocks of `8 * elem_size` bytes. Within a block,
 * each 32-byte slice starting at offset `jj` is reduced to eight 32-bit
 * masks (one per bit position, taken with movemask). The mask for bit
 * position `bit` is written at offset `jj / 8 + bit * elem_size` inside the
 * same block of `out`.
 *
 * When elem_size is not a multiple of 4 the call is forwarded to
 * bshuf_shuffle_bit_eightelem_sse2(). Returns size * elem_size otherwise
 * (after CHECK_MULT_EIGHT(size) has passed).
 *
 * With a bit of care, this could be written such that it is
 * in_buf = out_buf safe; as written, `in` and `out` should be distinct.
 */
int64_t bshuf_shuffle_bit_eightelem_avx2(void* in, void* out, const size_t size,
                                         const size_t elem_size) {
    CHECK_MULT_EIGHT(size);

    const char* src = (const char*) in;
    char* dst = (char*) out;
    const size_t nbyte = elem_size * size;
    const size_t block = 8 * elem_size;
    size_t ii, jj, bit;

    if (elem_size % 4) {
        return bshuf_shuffle_bit_eightelem_sse2(in, out, size, elem_size);
    }

    for (jj = 0; jj + 31 < block; jj += 32) {
        for (ii = 0; ii + block - 1 < nbyte; ii += block) {
            __m256i ymm = _mm256_loadu_si256((const __m256i *) &src[ii + jj]);

            /* movemask reads the top bit of each byte; the left shift then
             * moves the next lower bit into place for the following pass. */
            for (bit = 8; bit-- > 0; ) {
                const int32_t bt = _mm256_movemask_epi8(ymm);
                ymm = _mm256_slli_epi16(ymm, 1);
                /* memcpy instead of a store through a casted int32_t
                 * pointer: `out` is a byte buffer of unknown alignment. */
                memcpy(&dst[ii + jj / 8 + bit * elem_size], &bt, sizeof bt);
            }
        }
    }

    return size * elem_size;
}
/* Untranspose bits within elements.
 *
 * Two stages, with `tmp_buf` holding the intermediate result:
 *   1. bshuf_trans_byte_bitrow_avx2:      in      -> tmp_buf
 *   2. bshuf_shuffle_bit_eightelem_avx2:  tmp_buf -> out
 *
 * `size` is checked with CHECK_MULT_EIGHT first. The result of stage 1 is
 * passed through CHECK_ERR; otherwise the value produced by stage 2 is
 * returned.
 */
int64_t bshuf_untrans_bit_elem_avx2(void* in, void* out, const size_t size,
                                    const size_t elem_size, void* tmp_buf) {
    int64_t rc;

    CHECK_MULT_EIGHT(size);

    rc = bshuf_trans_byte_bitrow_avx2(in, tmp_buf, size, elem_size);
    CHECK_ERR(rc);

    return bshuf_shuffle_bit_eightelem_avx2(tmp_buf, out, size, elem_size);
}
|