/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Author: Francesc Alted <francesc@blosc.org>
  Creation date: 2018-01-03

  See LICENSES/BLOSC.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  The code in this file is heavily based on memcopy.h, from the
  zlib-ng compression library.  See LICENSES/ZLIB.txt for details.
  See also: https://github.com/Dead2/zlib-ng/blob/develop/zlib.h

  New implementations by Francesc Alted:
    * get_run() and get_match() family of functions
    * fast_copy() and safe_copy() functions
    * Support for SSE2/AVX2 copy instructions for these routines
**********************************************************************/

#include <assert.h>
#include "blosc-common.h"
/* Small fixed-size copy helpers.  Each one copies exactly N bytes from
   FROM to OUT and returns OUT + N. */

static inline unsigned char *copy_1_bytes(unsigned char *out, const unsigned char *from) {
  *out++ = *from;
  return out;
}

static inline unsigned char *copy_2_bytes(unsigned char *out, const unsigned char *from) {
#if defined(BLOSC_STRICT_ALIGN)
  uint16_t chunk;
  memcpy(&chunk, from, 2);
  memcpy(out, &chunk, 2);
#else
  *(uint16_t *) out = *(uint16_t *) from;
#endif
  return out + 2;
}

static inline unsigned char *copy_3_bytes(unsigned char *out, const unsigned char *from) {
  out = copy_1_bytes(out, from);
  return copy_2_bytes(out, from + 1);
}

static inline unsigned char *copy_4_bytes(unsigned char *out, const unsigned char *from) {
#if defined(BLOSC_STRICT_ALIGN)
  uint32_t chunk;
  memcpy(&chunk, from, 4);
  memcpy(out, &chunk, 4);
#else
  *(uint32_t *) out = *(uint32_t *) from;
#endif
  return out + 4;
}

static inline unsigned char *copy_5_bytes(unsigned char *out, const unsigned char *from) {
  out = copy_1_bytes(out, from);
  return copy_4_bytes(out, from + 1);
}

static inline unsigned char *copy_6_bytes(unsigned char *out, const unsigned char *from) {
  out = copy_2_bytes(out, from);
  return copy_4_bytes(out, from + 2);
}

static inline unsigned char *copy_7_bytes(unsigned char *out, const unsigned char *from) {
  out = copy_3_bytes(out, from);
  return copy_4_bytes(out, from + 3);
}

static inline unsigned char *copy_8_bytes(unsigned char *out, const unsigned char *from) {
#if defined(BLOSC_STRICT_ALIGN)
  uint64_t chunk;
  memcpy(&chunk, from, 8);
  memcpy(out, &chunk, 8);
#else
  *(uint64_t *) out = *(uint64_t *) from;
#endif
  return out + 8;
}

static inline unsigned char *copy_16_bytes(unsigned char *out, const unsigned char *from) {
#if defined(__SSE2__)
  __m128i chunk;
  chunk = _mm_loadu_si128((__m128i*)from);
  _mm_storeu_si128((__m128i*)out, chunk);
  from += 16; out += 16;
#elif !defined(BLOSC_STRICT_ALIGN)
  *(uint64_t*)out = *(uint64_t*)from;
  from += 8; out += 8;
  *(uint64_t*)out = *(uint64_t*)from;
  from += 8; out += 8;
#else
  int i;
  for (i = 0; i < 16; i++) {
    *out++ = *from++;
  }
#endif
  return out;
}

static inline unsigned char *copy_32_bytes(unsigned char *out, const unsigned char *from) {
#if defined(__AVX2__)
  __m256i chunk;
  chunk = _mm256_loadu_si256((__m256i*)from);
  _mm256_storeu_si256((__m256i*)out, chunk);
  from += 32; out += 32;
#elif defined(__SSE2__)
  __m128i chunk;
  chunk = _mm_loadu_si128((__m128i*)from);
  _mm_storeu_si128((__m128i*)out, chunk);
  from += 16; out += 16;
  chunk = _mm_loadu_si128((__m128i*)from);
  _mm_storeu_si128((__m128i*)out, chunk);
  from += 16; out += 16;
#elif !defined(BLOSC_STRICT_ALIGN)
  *(uint64_t*)out = *(uint64_t*)from;
  from += 8; out += 8;
  *(uint64_t*)out = *(uint64_t*)from;
  from += 8; out += 8;
  *(uint64_t*)out = *(uint64_t*)from;
  from += 8; out += 8;
  *(uint64_t*)out = *(uint64_t*)from;
  from += 8; out += 8;
#else
  int i;
  for (i = 0; i < 32; i++) {
    *out++ = *from++;
  }
#endif
  return out;
}

#if defined(__AVX2__)
/* Like copy_32_bytes(), but FROM must be 32-byte aligned (aligned load,
   unaligned store). */
static inline unsigned char *copy_32_bytes_aligned(unsigned char *out, const unsigned char *from) {
  __m256i chunk;
  chunk = _mm256_load_si256((__m256i*)from);
  _mm256_storeu_si256((__m256i*)out, chunk);
  return out + 32;
}
#endif  // __AVX2__
/* Copy LEN bytes (7 or fewer) from FROM into OUT.  Return OUT + LEN. */
static inline unsigned char *copy_bytes(unsigned char *out, const unsigned char *from, unsigned len) {
  assert(len < 8);
#ifdef BLOSC_STRICT_ALIGN
  while (len--) {
    *out++ = *from++;
  }
#else
  switch (len) {
    case 7:
      return copy_7_bytes(out, from);
    case 6:
      return copy_6_bytes(out, from);
    case 5:
      return copy_5_bytes(out, from);
    case 4:
      return copy_4_bytes(out, from);
    case 3:
      return copy_3_bytes(out, from);
    case 2:
      return copy_2_bytes(out, from);
    case 1:
      return copy_1_bytes(out, from);
    case 0:
      return out;
    default:
      assert(0);
  }
#endif /* BLOSC_STRICT_ALIGN */
  return out;
}
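/* Note (illustrative, not from the original sources): chunk_memcpy() below
   relies on a small remainder trick.  It first copies one full 8-byte word,
   then advances both pointers by only rem = len % 8 bytes, so the length
   left to copy is an exact multiple of 8.  Worked example with len = 21:
   rem = 5, so the initial copy_8_bytes() handles offsets 0..7, the pointers
   advance by 5, and the remaining 16 bytes (offsets 5..20) are copied as two
   more 8-byte words.  Offsets 5..7 are written twice, which is harmless as
   long as source and destination do not overlap. */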
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT.  Return OUT + LEN. */
static inline unsigned char *chunk_memcpy(unsigned char *out, const unsigned char *from, unsigned len) {
  unsigned sz = sizeof(uint64_t);
  unsigned rem = len % sz;
  unsigned by8;

  assert(len >= sz);

  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
  copy_8_bytes(out, from);

  len /= sz;
  out += rem;
  from += rem;

  by8 = len % 8;
  len -= by8;
  /* The switch intentionally falls through: case N performs N copies. */
  switch (by8) {
    case 7:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fall through */
    case 6:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fall through */
    case 5:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fall through */
    case 4:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fall through */
    case 3:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fall through */
    case 2:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fall through */
    case 1:
      out = copy_8_bytes(out, from);
      from += sz;
      /* fall through */
    default:
      break;
  }

  while (len) {
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;

    len -= 8;
  }

  return out;
}
/* 16-byte version of chunk_memcpy() */
static inline unsigned char *chunk_memcpy_16(unsigned char *out, const unsigned char *from, unsigned len) {
  unsigned sz = 16;
  unsigned rem = len % sz;
  unsigned ilen;

  assert(len >= sz);

  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
  copy_16_bytes(out, from);

  len /= sz;
  out += rem;
  from += rem;

  for (ilen = 0; ilen < len; ilen++) {
    copy_16_bytes(out, from);
    out += sz;
    from += sz;
  }

  return out;
}
/* 32-byte version of chunk_memcpy() */
static inline unsigned char *chunk_memcpy_32(unsigned char *out, const unsigned char *from, unsigned len) {
  unsigned sz = 32;
  unsigned rem = len % sz;
  unsigned ilen;

  assert(len >= sz);

  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
  copy_32_bytes(out, from);

  len /= sz;
  out += rem;
  from += rem;

  for (ilen = 0; ilen < len; ilen++) {
    copy_32_bytes(out, from);
    out += sz;
    from += sz;
  }

  return out;
}
/* 32-byte *unrolled* version of chunk_memcpy() */
static inline unsigned char *chunk_memcpy_32_unrolled(unsigned char *out, const unsigned char *from, unsigned len) {
  unsigned sz = 32;
  unsigned rem = len % sz;
  unsigned by8;

  assert(len >= sz);

  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
  copy_32_bytes(out, from);

  len /= sz;
  out += rem;
  from += rem;

  by8 = len % 8;
  len -= by8;
  /* The switch intentionally falls through: case N performs N copies. */
  switch (by8) {
    case 7:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fall through */
    case 6:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fall through */
    case 5:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fall through */
    case 4:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fall through */
    case 3:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fall through */
    case 2:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fall through */
    case 1:
      out = copy_32_bytes(out, from);
      from += sz;
      /* fall through */
    default:
      break;
  }

  while (len) {
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;
    out = copy_32_bytes(out, from);
    from += sz;

    len -= 8;
  }

  return out;
}
/* SSE2/AVX2 *unaligned* version of chunk_memcpy() */
#if defined(__SSE2__) || defined(__AVX2__)
static inline unsigned char *chunk_memcpy_unaligned(unsigned char *out, const unsigned char *from, unsigned len) {
#if defined(__AVX2__)
  unsigned sz = sizeof(__m256i);
#elif defined(__SSE2__)
  unsigned sz = sizeof(__m128i);
#endif
  unsigned rem = len % sz;
  unsigned ilen;

  assert(len >= sz);

  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
#if defined(__AVX2__)
  copy_32_bytes(out, from);
#elif defined(__SSE2__)
  copy_16_bytes(out, from);
#endif

  len /= sz;
  out += rem;
  from += rem;

  for (ilen = 0; ilen < len; ilen++) {
#if defined(__AVX2__)
    copy_32_bytes(out, from);
#elif defined(__SSE2__)
    copy_16_bytes(out, from);
#endif
    out += sz;
    from += sz;
  }

  return out;
}
#endif  // __SSE2__ || __AVX2__
#if defined(__SSE2__) || defined(__AVX2__)
/* SSE2/AVX2 *aligned* version of chunk_memcpy() */
static inline unsigned char *chunk_memcpy_aligned(unsigned char *out, const unsigned char *from, unsigned len) {
#if defined(__AVX2__)
  unsigned sz = sizeof(__m256i);
  __m256i chunk;
#elif defined(__SSE2__)
  unsigned sz = sizeof(__m128i);
  __m128i chunk;
#endif
  /* Bytes needed to bring FROM up to the next SZ boundary.  When FROM is
     already aligned this is SZ, not 0, so the unaligned copy below always
     makes progress. */
  unsigned bytes_to_align = sz - (unsigned)(((uintptr_t)(const void *)(from)) % sz);
  unsigned corrected_len = len - bytes_to_align;
  unsigned rem = corrected_len % sz;
  unsigned ilen;

  assert(len >= sz);

  /* Copy a few bytes to make sure the loop below has aligned access. */
#if defined(__AVX2__)
  chunk = _mm256_loadu_si256((__m256i *) from);
  _mm256_storeu_si256((__m256i *) out, chunk);
#elif defined(__SSE2__)
  chunk = _mm_loadu_si128((__m128i *) from);
  _mm_storeu_si128((__m128i *) out, chunk);
#endif
  out += bytes_to_align;
  from += bytes_to_align;

  len = corrected_len / sz;
  for (ilen = 0; ilen < len; ilen++) {
#if defined(__AVX2__)
    chunk = _mm256_load_si256((__m256i *) from);  /* *aligned* load */
    _mm256_storeu_si256((__m256i *) out, chunk);
#elif defined(__SSE2__)
    chunk = _mm_load_si128((__m128i *) from);  /* *aligned* load */
    _mm_storeu_si128((__m128i *) out, chunk);
#endif
    out += sz;
    from += sz;
  }

  /* Copy remaining bytes */
  if (rem < 8) {
    out = copy_bytes(out, from, rem);
  }
  else {
    out = chunk_memcpy(out, from, rem);
  }

  return out;
}
#endif  // __AVX2__ || __SSE2__
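/* Worked example for chunk_memcpy_aligned() above (illustrative, not from
   the original sources).  With AVX2, sz = 32.  Suppose from == 0x1014, so
   from % 32 == 20 and bytes_to_align == 12.  The first unaligned 32-byte
   copy handles offsets 0..31; the pointers then advance by only 12, leaving
   FROM at 0x1020, a 32-byte boundary.  corrected_len = len - 12 bytes
   remain, of which corrected_len / 32 blocks are moved with aligned loads,
   and the final rem = corrected_len % 32 bytes are finished by copy_bytes()
   or chunk_memcpy().  Offsets 12..31 are written twice, which is fine since
   source and destination must not overlap here. */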
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT.  Return OUT + LEN. */
unsigned char *fastcopy(unsigned char *out, const unsigned char *from, unsigned len) {
  switch (len) {
    case 32:
      return copy_32_bytes(out, from);
    case 16:
      return copy_16_bytes(out, from);
    case 8:
      return copy_8_bytes(out, from);
    default: {
    }
  }
  if (len < 8) {
    return copy_bytes(out, from, len);
  }
#if defined(__SSE2__)
  if (len < 16) {
    return chunk_memcpy(out, from, len);
  }
#if !defined(__AVX2__)
  return chunk_memcpy_unaligned(out, from, len);
#else
  if (len < 32) {
    return chunk_memcpy_16(out, from, len);
  }
  return chunk_memcpy_unaligned(out, from, len);
#endif  // !__AVX2__
#endif  // __SSE2__
  return chunk_memcpy(out, from, len);
}
/* Same as fastcopy() but without overwriting origin or destination when they overlap */
unsigned char* safecopy(unsigned char *out, const unsigned char *from, unsigned len) {
#if defined(__AVX2__)
  unsigned sz = sizeof(__m256i);
#elif defined(__SSE2__)
  unsigned sz = sizeof(__m128i);
#else
  unsigned sz = sizeof(uint64_t);
#endif
  /* If OUT is not at least SZ bytes ahead of FROM, the wide copies used by
     fastcopy() could read source bytes before this call has written them,
     breaking the byte-by-byte overlap semantics, so use a plain byte loop. */
  if (out - sz < from) {
    for (; len; --len) {
      *out++ = *from++;
    }
    return out;
  }
  else {
    return fastcopy(out, from, len);
  }
}
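
/* ---------------------------------------------------------------------
 * Illustrative usage sketch (not part of the original Blosc sources).
 * The FASTCOPY_EXAMPLE guard macro is hypothetical; compiling this file
 * with -DFASTCOPY_EXAMPLE builds a tiny standalone check of the
 * byte-by-byte semantics described above.
 * ------------------------------------------------------------------- */
#ifdef FASTCOPY_EXAMPLE
#include <stdio.h>
#include <string.h>

int main(void) {
  unsigned char src[64], dst[64];
  unsigned i;

  for (i = 0; i < (unsigned)sizeof(src); i++) {
    src[i] = (unsigned char)i;
  }

  /* Non-overlapping buffers: fastcopy() behaves like memcpy(). */
  fastcopy(dst, src, (unsigned)sizeof(src));
  printf("fastcopy matches memcpy: %d\n", memcmp(dst, src, sizeof(src)) == 0);

  /* Overlapping forward copy (out == from + 1, distance < sz): safecopy()
     falls back to its byte loop, so each byte written is reused as input for
     the next one, replicating dst[0] over the following 16 bytes.  This is
     the kind of run expansion an LZ-style decompressor needs. */
  memset(dst, 0, sizeof(dst));
  dst[0] = 0xAA;
  safecopy(dst + 1, dst, 16);
  printf("run of 0xAA bytes: %d\n", dst[16] == 0xAA);

  return 0;
}
#endif  /* FASTCOPY_EXAMPLE */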