/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Author: Francesc Alted <francesc@blosc.org>

  See LICENSES/BLOSC.txt for details about copyright and rights to use.
**********************************************************************/

#include "shuffle-generic.h"
#include "shuffle-sse2.h"

/* Make sure SSE2 is available for the compilation target and compiler. */
#if !defined(__SSE2__)
#error SSE2 is not supported by the target architecture/platform and/or this compiler.
#endif

#include <emmintrin.h>

/* The next is useful for debugging purposes */
#if 0
#include <stdio.h>
#include <string.h>

static void printxmm(__m128i xmm0)
{
  uint8_t buf[16];

  ((__m128i *)buf)[0] = xmm0;
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
          buf[0], buf[1], buf[2], buf[3],
          buf[4], buf[5], buf[6], buf[7],
          buf[8], buf[9], buf[10], buf[11],
          buf[12], buf[13], buf[14], buf[15]);
}
#endif

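/* The shuffleN_sse2 routines below all perform the same transformation for a
   fixed type size N: within each group of 16 consecutive elements, the k-th
   byte of every element is gathered into the k-th byte plane of the output,
   which starts at dest + k * total_elements. For example, with a 2-byte type
   the input a0 b0 a1 b1 a2 b2 ... becomes a0 a1 a2 ... followed by
   b0 b1 b2 ... */
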
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 2;
  size_t j;
  int k;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[2], xmm1[2];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]);
      xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]);
    }
  }
}

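/* Note on the shuffle immediates used above and in shuffle4_sse2 just below:
   with _mm_shuffle_epi32, 0xd8 reorders the 32-bit lanes as (0, 2, 1, 3) and
   0x4e swaps the two 64-bit halves of the register;
   _mm_shufflelo_epi16/_mm_shufflehi_epi16 with 0xd8 apply the same
   (0, 2, 1, 3) reordering to the 16-bit lanes of one half. Combined with the
   unpack instructions, these interleavings carry out the byte transposition
   one power-of-two stride at a time. */
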
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  uint8_t* dest_for_ith_element;
  __m128i xmm0[4], xmm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Fetch 16 elements (64 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src + (i * bytesoftype) + (j * sizeof(__m128i))));
      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0xd8);
      xmm0[j] = _mm_shuffle_epi32(xmm0[j], 0x8d);
      xmm0[j] = _mm_unpacklo_epi8(xmm1[j], xmm0[j]);
      xmm1[j] = _mm_shuffle_epi32(xmm0[j], 0x04e);
      xmm0[j] = _mm_unpacklo_epi16(xmm0[j], xmm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      xmm1[j*2] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]);
      xmm1[j*2+1] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      xmm0[j*2] = _mm_unpacklo_epi64(xmm1[j], xmm1[j+2]);
      xmm0[j*2+1] = _mm_unpackhi_epi64(xmm1[j], xmm1[j+2]);
    }
    /* Store the result vectors */
    dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm_storeu_si128((__m128i*)(dest_for_ith_element + (j * total_elements)), xmm0[j]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 8 bytes. */
static void
shuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 8;
  size_t j;
  int k, l;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[8], xmm1[8];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (128 bytes) then transpose bytes. */
    for (k = 0; k < 8; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm1[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
    }
    /* Transpose words */
    for (k = 0, l = 0; k < 4; k++, l +=2) {
      xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+1]);
      xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+1]);
    }
    /* Transpose double words */
    for (k = 0, l = 0; k < 4; k++, l++) {
      if (k == 2) l += 2;
      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+2]);
      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+2]);
    }
    /* Transpose quad words */
    for (k = 0; k < 4; k++) {
      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+4]);
      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+4]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 8; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
    }
  }
}

/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
static void
shuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
               const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 16;
  size_t j;
  int k, l;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[16], xmm1[16];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (256 bytes). */
    for (k = 0; k < 16; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
    }
    /* Transpose bytes */
    for (k = 0, l = 0; k < 8; k++, l +=2) {
      xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]);
      xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]);
    }
    /* Transpose words */
    for (k = 0, l = -2; k < 8; k++, l++) {
      if ((k%2) == 0) l += 2;
      xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]);
      xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]);
    }
    /* Transpose double words */
    for (k = 0, l = -4; k < 8; k++, l++) {
      if ((k%4) == 0) l += 4;
      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]);
      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]);
      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 16; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]);
    }
  }
}

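/* For type sizes larger than 16 bytes there is no dedicated kernel per size.
   Instead, shuffle16_tiled_sse2 (below) and unshuffle16_tiled_sse2 (further
   down) walk over each element in 16-byte tiles, reusing the 16-byte
   transposition above for every tile. When bytesoftype is not a multiple of
   16, the first tile only advances by bytesoftype % 16 so that all remaining
   tiles are 16-byte aligned within the element; e.g. for bytesoftype == 20
   the offsets visited are 0 and 4, and byte planes 4..15 are simply written
   twice with identical data. */
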
/* Routine optimized for shuffling a buffer for a type size larger than 16 bytes. */
static void
shuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const src,
                     const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype)
{
  size_t j;
  const size_t vecs_per_el_rem = bytesoftype % sizeof(__m128i);
  int k, l;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[16], xmm1[16];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Advance the offset into the type by the vector size (in bytes), unless this is
       the initial iteration and the type size is not a multiple of the vector size.
       In that case, only advance by the number of bytes necessary so that the number
       of remaining bytes in the type will be a multiple of the vector size. */
    size_t offset_into_type;
    for (offset_into_type = 0; offset_into_type < bytesoftype;
         offset_into_type += (offset_into_type == 0 && vecs_per_el_rem > 0 ? vecs_per_el_rem : sizeof(__m128i))) {

      /* Fetch elements in groups of 256 bytes */
      const uint8_t* const src_with_offset = src + offset_into_type;
      for (k = 0; k < 16; k++) {
        xmm0[k] = _mm_loadu_si128((__m128i*)(src_with_offset + (j + k) * bytesoftype));
      }
      /* Transpose bytes */
      for (k = 0, l = 0; k < 8; k++, l +=2) {
        xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]);
        xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]);
      }
      /* Transpose words */
      for (k = 0, l = -2; k < 8; k++, l++) {
        if ((k%2) == 0) l += 2;
        xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]);
        xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]);
      }
      /* Transpose double words */
      for (k = 0, l = -4; k < 8; k++, l++) {
        if ((k%4) == 0) l += 4;
        xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]);
        xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]);
      }
      /* Transpose quad words */
      for (k = 0; k < 8; k++) {
        xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]);
        xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]);
      }
      /* Store the result vectors */
      dest_for_jth_element = dest + j;
      for (k = 0; k < 16; k++) {
        _mm_storeu_si128((__m128i*)(dest_for_jth_element + (total_elements * (offset_into_type + k))), xmm0[k]);
      }
    }
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void
unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 2;
  size_t i;
  int j;
  __m128i xmm0[2], xmm1[2];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (32 bytes) into 2 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 2; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    /* Compute the low 32 bytes */
    xmm1[0] = _mm_unpacklo_epi8(xmm0[0], xmm0[1]);
    /* Compute the hi 32 bytes */
    xmm1[1] = _mm_unpackhi_epi8(xmm0[0], xmm0[1]);
    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[1]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m128i xmm0[4], xmm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (64 bytes) into 4 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm1[2+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 32 bytes */
      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm0[2+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void
unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 8;
  size_t i;
  int j;
  __m128i xmm0[8], xmm1[8];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (128 bytes) into 8 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 8; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]);
    }
    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[4]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[2]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[6]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[1]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[5]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[3]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[7]);
  }
}

/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
                 const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 16;
  size_t i;
  int j;
  __m128i xmm1[16], xmm2[16];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (256 bytes) into 16 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 16; j++) {
      xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
    }
    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[8]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[4]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[12]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[2]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[10]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[6]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[14]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (8 * sizeof(__m128i))), xmm1[1]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (9 * sizeof(__m128i))), xmm1[9]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (10 * sizeof(__m128i))), xmm1[5]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (11 * sizeof(__m128i))), xmm1[13]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (12 * sizeof(__m128i))), xmm1[3]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (13 * sizeof(__m128i))), xmm1[11]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (14 * sizeof(__m128i))), xmm1[7]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (15 * sizeof(__m128i))), xmm1[15]);
  }
}

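/* In the unshuffleN routines above, the final stores appear out of numerical
   order (e.g. xmm1[0], xmm1[8], xmm1[4], xmm1[12], ... for the 16-byte case)
   because each unpacklo/unpackhi pass interleaves register halves: after
   log2(N) passes the reassembled elements sit in bit-reversed register
   order, and the store sequence undoes that permutation. */
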
/* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
static void
unshuffle16_tiled_sse2(uint8_t* const dest, const uint8_t* const orig,
                       const size_t vectorizable_elements, const size_t total_elements, const size_t bytesoftype)
{
  size_t i;
  const size_t vecs_per_el_rem = bytesoftype % sizeof(__m128i);
  int j;
  uint8_t* dest_with_offset;
  __m128i xmm1[16], xmm2[16];

  /* The unshuffle loops are inverted (compared to shuffle16_tiled_sse2)
     to optimize cache utilization. */
  size_t offset_into_type;
  for (offset_into_type = 0; offset_into_type < bytesoftype;
       offset_into_type += (offset_into_type == 0 && vecs_per_el_rem > 0 ? vecs_per_el_rem : sizeof(__m128i))) {
    for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
      /* Load 16 vectors (256 bytes) into 16 XMM registers */
      const uint8_t* const src_for_ith_element = orig + i;
      for (j = 0; j < 16; j++) {
        xmm1[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (total_elements * (offset_into_type + j))));
      }
      /* Shuffle bytes */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
        /* Compute the hi 32 bytes */
        xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
      }
      /* Shuffle 2-byte words */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
        /* Compute the hi 32 bytes */
        xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
      }
      /* Shuffle 4-byte dwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
        /* Compute the hi 32 bytes */
        xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
      }
      /* Shuffle 8-byte qwords */
      for (j = 0; j < 8; j++) {
        /* Compute the low 32 bytes */
        xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
        /* Compute the hi 32 bytes */
        xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
      }
      /* Store the result vectors in proper order */
      dest_with_offset = dest + offset_into_type;
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 0) * bytesoftype), xmm1[0]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 1) * bytesoftype), xmm1[8]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 2) * bytesoftype), xmm1[4]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 3) * bytesoftype), xmm1[12]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 4) * bytesoftype), xmm1[2]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 5) * bytesoftype), xmm1[10]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 6) * bytesoftype), xmm1[6]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 7) * bytesoftype), xmm1[14]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 8) * bytesoftype), xmm1[1]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 9) * bytesoftype), xmm1[9]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 10) * bytesoftype), xmm1[5]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 11) * bytesoftype), xmm1[13]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 12) * bytesoftype), xmm1[3]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 13) * bytesoftype), xmm1[11]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 14) * bytesoftype), xmm1[7]);
      _mm_storeu_si128((__m128i*)(dest_with_offset + (i + 15) * bytesoftype), xmm1[15]);
    }
  }
}

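/* The two entry points below (shuffle_sse2 and unshuffle_sse2) split the
   block into a vectorizable prefix and a leftover tail. As a worked example
   of the arithmetic: for bytesoftype == 4 and blocksize == 100, the
   vectorized chunk size is 4 * 16 = 64 bytes, so vectorizable_bytes == 64,
   vectorizable_elements == 16 and total_elements == 25; the SSE2 kernel
   handles the first 16 elements and the generic inline routines finish the
   remaining 9. */
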
/* Shuffle a block. This can never fail. */
void
shuffle_sse2(const size_t bytesoftype, const size_t blocksize,
             const uint8_t* const _src, uint8_t* const _dest) {
  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m128i);

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized shuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const size_t total_elements = blocksize / bytesoftype;

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    shuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized shuffle implementations */
  switch (bytesoftype)
  {
  case 2:
    shuffle2_sse2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 4:
    shuffle4_sse2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 8:
    shuffle8_sse2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 16:
    shuffle16_sse2(_dest, _src, vectorizable_elements, total_elements);
    break;
  default:
    if (bytesoftype > sizeof(__m128i)) {
      shuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
    }
    else {
      /* Non-optimized shuffle */
      shuffle_generic(bytesoftype, blocksize, _src, _dest);
      /* The non-optimized function covers the whole buffer,
         so we're done processing here. */
      return;
    }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    shuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}

/* Unshuffle a block. This can never fail. */
void
unshuffle_sse2(const size_t bytesoftype, const size_t blocksize,
               const uint8_t* const _src, uint8_t* const _dest) {
  const size_t vectorized_chunk_size = bytesoftype * sizeof(__m128i);

  /* If the blocksize is not a multiple of both the typesize and
     the vector size, round the blocksize down to the next value
     which is a multiple of both. The vectorized unshuffle can be
     used for that portion of the data, and the naive implementation
     can be used for the remaining portion. */
  const size_t vectorizable_bytes = blocksize - (blocksize % vectorized_chunk_size);
  const size_t vectorizable_elements = vectorizable_bytes / bytesoftype;
  const size_t total_elements = blocksize / bytesoftype;

  /* If the block size is too small to be vectorized,
     use the generic implementation. */
  if (blocksize < vectorized_chunk_size) {
    unshuffle_generic(bytesoftype, blocksize, _src, _dest);
    return;
  }

  /* Optimized unshuffle implementations */
  switch (bytesoftype)
  {
  case 2:
    unshuffle2_sse2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 4:
    unshuffle4_sse2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 8:
    unshuffle8_sse2(_dest, _src, vectorizable_elements, total_elements);
    break;
  case 16:
    unshuffle16_sse2(_dest, _src, vectorizable_elements, total_elements);
    break;
  default:
    if (bytesoftype > sizeof(__m128i)) {
      unshuffle16_tiled_sse2(_dest, _src, vectorizable_elements, total_elements, bytesoftype);
    }
    else {
      /* Non-optimized unshuffle */
      unshuffle_generic(bytesoftype, blocksize, _src, _dest);
      /* The non-optimized function covers the whole buffer,
         so we're done processing here. */
      return;
    }
  }

  /* If the buffer had any bytes at the end which couldn't be handled
     by the vectorized implementations, use the non-optimized version
     to finish them up. */
  if (vectorizable_bytes < blocksize) {
    unshuffle_generic_inline(bytesoftype, vectorizable_bytes, blocksize, _src, _dest);
  }
}
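
/* A minimal round-trip sketch for the entry points above (kept disabled,
   like the debug helper near the top of this file). The main() harness, the
   buffer size and the fill pattern are illustrative assumptions, not part of
   the library. */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
  const size_t bytesoftype = 4;              /* shuffle 4-byte elements */
  const size_t blocksize = 4096;             /* multiple of 4 * sizeof(__m128i) */
  const size_t nelems = blocksize / bytesoftype;
  uint8_t* src = (uint8_t*)malloc(blocksize);
  uint8_t* shuffled = (uint8_t*)malloc(blocksize);
  uint8_t* restored = (uint8_t*)malloc(blocksize);
  size_t i;

  /* Fill the source buffer with a simple 32-bit pattern. */
  for (i = 0; i < nelems; i++) {
    ((uint32_t*)src)[i] = (uint32_t)i;
  }

  /* Shuffle, then unshuffle, and check that the data survives unchanged. */
  shuffle_sse2(bytesoftype, blocksize, src, shuffled);
  unshuffle_sse2(bytesoftype, blocksize, shuffled, restored);
  printf("round-trip %s\n",
         memcmp(src, restored, blocksize) == 0 ? "OK" : "FAILED");

  free(src);
  free(shuffled);
  free(restored);
  return 0;
}
#endif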