blosclz.c

/*********************************************************************
  Blosc - Blocked Shuffling and Compression Library

  Author: Francesc Alted <francesc@blosc.org>
  Creation date: 2009-05-20

  See LICENSES/BLOSC.txt for details about copyright and rights to use.
**********************************************************************/

/*********************************************************************
  The code in this file is heavily based on FastLZ, a lightning-fast
  lossless compression library.  See LICENSES/FASTLZ.txt for details.
**********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>     /* memset(), memcpy() */

#if defined(__AVX2__)
#include <immintrin.h>  /* AVX2 intrinsics (may already come in via blosc-common.h) */
#elif defined(__SSE2__)
#include <emmintrin.h>  /* SSE2 intrinsics (may already come in via blosc-common.h) */
#endif

#include "blosclz.h"
#include "fastcopy.h"
#include "blosc-common.h"
/*
 * Check for bounds when decompressing.
 * It is a good idea to define this while developing.
 */
#undef BLOSCLZ_SAFE

/*
 * Give hints to the compiler for branch prediction optimization.
 */
#if defined(__GNUC__) && (__GNUC__ > 2)
#define BLOSCLZ_EXPECT_CONDITIONAL(c) (__builtin_expect((c), 1))
#define BLOSCLZ_UNEXPECT_CONDITIONAL(c) (__builtin_expect((c), 0))
#else
#define BLOSCLZ_EXPECT_CONDITIONAL(c) (c)
#define BLOSCLZ_UNEXPECT_CONDITIONAL(c) (c)
#endif
/*
 * Use inlined functions for supported systems.
 */
#if defined(_MSC_VER) && !defined(__cplusplus)  /* Visual Studio */
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
#endif
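/*
 * MAX_COPY is the longest literal run a single token can carry.
 * MAX_DISTANCE is the largest match offset encodable in the short (13-bit)
 * form; MAX_FARDISTANCE extends it with an extra 16-bit offset for "far"
 * matches.
 */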
#define MAX_COPY 32
#define MAX_DISTANCE 8191
#define MAX_FARDISTANCE (65535 + MAX_DISTANCE - 1)

#ifdef BLOSC_STRICT_ALIGN
#define BLOSCLZ_READU16(p) ((p)[0] | (p)[1]<<8)
#else
#define BLOSCLZ_READU16(p) *((const uint16_t*)(p))
#endif
/* Simple, but pretty effective hash function for 3-byte sequence */
#define HASH_FUNCTION(v, p, l) {                   \
  v = BLOSCLZ_READU16(p);                          \
  v ^= BLOSCLZ_READU16(p + 1) ^ (v >> (16 - l));   \
  v &= (1 << l) - 1;                               \
}
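/*
 * Emit one literal byte and restart the main loop.  `copy` counts the bytes
 * in the current literal run; when it reaches MAX_COPY the run is closed
 * and a fresh token header (MAX_COPY - 1) is started.
 */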
#define LITERAL(ip, op, op_limit, anchor, copy) {        \
  if (BLOSCLZ_UNEXPECT_CONDITIONAL(op + 2 > op_limit))   \
    goto out;                                            \
  *op++ = *anchor++;                                     \
  ip = anchor;                                           \
  copy++;                                                \
  if (BLOSCLZ_UNEXPECT_CONDITIONAL(copy == MAX_COPY)) {  \
    copy = 0;                                            \
    *op++ = MAX_COPY - 1;                                \
  }                                                      \
  continue;                                              \
}

#define IP_BOUNDARY 2
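/*
 * Scan forward from ip while the input keeps repeating the byte ip[-1]
 * (a run), returning the first position where the run breaks.
 */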
static inline uint8_t *get_run(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  int64_t value, value2;
  /* Broadcast the value for every byte in a 64-bit register */
  memset(&value, x, 8);
  /* safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(int64_t))) {
#if defined(BLOSC_STRICT_ALIGN)
    memcpy(&value2, ref, 8);
#else
    value2 = ((int64_t*)ref)[0];
#endif
    if (value != value2) {
      /* Find the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += 8;
      ref += 8;
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
#ifdef __SSE2__
static inline uint8_t *get_run_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  __m128i value, value2, cmp;
  /* Broadcast the value for every byte in a 128-bit register */
  memset(&value, x, sizeof(__m128i));
  /* safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(__m128i))) {
    value2 = _mm_loadu_si128((__m128i *)ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Find the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
#endif
#ifdef __AVX2__
static inline uint8_t *get_run_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  uint8_t x = ip[-1];
  __m256i value, value2, cmp;
  /* Broadcast the value for every byte in a 256-bit register */
  memset(&value, x, sizeof(__m256i));
  /* safe because of the outer check against the ip limit */
  while (ip < (ip_bound - sizeof(__m256i))) {
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if (_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Find the byte that starts to differ */
      while (*ref++ == x) ip++;
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == x)) ip++;
  return ip;
}
#endif
/* Return the first byte at which ip and ref start to differ (the match end) */
static uint8_t *get_match(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
#if !defined(BLOSC_STRICT_ALIGN)
  while (ip < (ip_bound - sizeof(int64_t))) {
    if (((int64_t*)ref)[0] != ((int64_t*)ip)[0]) {
      /* Find the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(int64_t);
      ref += sizeof(int64_t);
    }
  }
#endif
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#if defined(__SSE2__)
static uint8_t *get_match_16(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m128i value, value2, cmp;
  while (ip < (ip_bound - sizeof(__m128i))) {
    value = _mm_loadu_si128((__m128i *)ip);
    value2 = _mm_loadu_si128((__m128i *)ref);
    cmp = _mm_cmpeq_epi32(value, value2);
    if (_mm_movemask_epi8(cmp) != 0xFFFF) {
      /* Find the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m128i);
      ref += sizeof(__m128i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif
#if defined(__AVX2__)
static uint8_t *get_match_32(uint8_t *ip, const uint8_t *ip_bound, const uint8_t *ref) {
  __m256i value, value2, cmp;
  while (ip < (ip_bound - sizeof(__m256i))) {
    value = _mm256_loadu_si256((__m256i *)ip);
    value2 = _mm256_loadu_si256((__m256i *)ref);
    cmp = _mm256_cmpeq_epi64(value, value2);
    if (_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) {
      /* Find the byte that starts to differ */
      while (*ref++ == *ip++) {}
      return ip;
    }
    else {
      ip += sizeof(__m256i);
      ref += sizeof(__m256i);
    }
  }
  /* Look into the remainder */
  while ((ip < ip_bound) && (*ref++ == *ip++)) {}
  return ip;
}
#endif
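/*
 * Compress `length` bytes from `input` into `output`, writing at most
 * `maxout` bytes.  `opt_level` goes from 1 (fastest) to 9 (best ratio); it
 * selects the hash-table size and how much of the output budget the
 * compressor is willing to spend.  Returns the number of bytes written, or
 * 0 when the input does not compress well enough to be worth keeping
 * (also when maxout < 66 or length < 4).
 */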
int blosclz_compress(const int opt_level, const void* input, int length,
                     void* output, int maxout) {
  uint8_t* ip = (uint8_t*)input;
  uint8_t* ibase = (uint8_t*)input;
  uint8_t* ip_bound = ip + length - IP_BOUNDARY;
  uint8_t* ip_limit = ip + length - 12;
  uint8_t* op = (uint8_t*)output;

  /* Hash table size depends on the opt level.  hash_log cannot be larger than 15. */
  /* The parametrization below comes from playing with the bench suite, like:
       $ bench/bench blosclz single 4
       $ bench/bench blosclz single 4 4194280 12 25
     and taking the minimum times on an i5-3380M @ 2.90GHz.
     Curiously enough, values >= 14 do not always
     get maximum compression, even with large blocksizes. */
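  /* With hash_log = 15 the table below has 2^15 = 32768 uint16_t slots,
     i.e. a 64 KB scratch buffer per call. */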
  int8_t hash_log_[10] = {-1, 15, 15, 15, 15, 15, 15, 15, 15, 15};
  uint8_t hash_log = hash_log_[opt_level];
  uint16_t hash_size = 1 << hash_log;
  uint16_t* htab;
  uint8_t* op_limit;
  int32_t hval;
  uint8_t copy;

  double maxlength_[10] = {-1, .1, .3, .5, .6, .8, .9, .95, 1.0, 1.0};
  int32_t maxlength = (int32_t)(length * maxlength_[opt_level]);
  if (maxlength > (int32_t)maxout) {
    maxlength = (int32_t)maxout;
  }
  op_limit = op + maxlength;

  /* output buffer cannot be less than 66 bytes or we can get into trouble */
  if (BLOSCLZ_UNEXPECT_CONDITIONAL(maxout < 66 || length < 4)) {
    return 0;
  }

  htab = (uint16_t*)calloc(hash_size, sizeof(uint16_t));

  /* we start with literal copy */
  copy = 2;
  *op++ = MAX_COPY - 1;
  *op++ = *ip++;
  *op++ = *ip++;
  /* main loop */
  while (BLOSCLZ_EXPECT_CONDITIONAL(ip < ip_limit)) {
    const uint8_t* ref;
    int32_t distance;
    int32_t len = 3;       /* minimum match length */
    uint8_t* anchor = ip;  /* comparison starting-point */

    /* check for a run */
    if (ip[0] == ip[-1] && BLOSCLZ_READU16(ip - 1) == BLOSCLZ_READU16(ip + 1)) {
      distance = 1;
      ip += 3;
      ref = anchor - 1 + 3;
      goto match;
    }

    /* find potential match */
    HASH_FUNCTION(hval, ip, hash_log);
    ref = ibase + htab[hval];

    /* calculate distance to the match */
    distance = (int32_t)(anchor - ref);

    /* update hash table if necessary */
    /* not exactly sure why 0x1F works best, but experiments apparently say so */
    if ((distance & 0x1F) == 0)
      htab[hval] = (uint16_t)(anchor - ibase);

    /* is this a match? check the first 3 bytes */
    if (distance == 0 || (distance >= MAX_FARDISTANCE) ||
        *ref++ != *ip++ || *ref++ != *ip++ || *ref++ != *ip++) {
      LITERAL(ip, op, op_limit, anchor, copy);
    }

    /* far, needs at least 5-byte match */
    if (opt_level >= 5 && distance >= MAX_DISTANCE) {
      if (*ip++ != *ref++ || *ip++ != *ref++) LITERAL(ip, op, op_limit, anchor, copy);
      len += 2;
    }
    match:

    /* last matched byte */
    ip = anchor + len;

    /* distance is biased */
    distance--;

    if (!distance) {
      /* zero distance means a run */
#if defined(__AVX2__)
      ip = get_run_32(ip, ip_bound, ref);
#elif defined(__SSE2__)
      ip = get_run_16(ip, ip_bound, ref);
#else
      ip = get_run(ip, ip_bound, ref);
#endif
    }
    else {
#if defined(__AVX2__)
      /* Experiments show that the SSE2 version is a bit faster, even on AVX2 processors */
      ip = get_match_16(ip, ip_bound + IP_BOUNDARY, ref);
#elif defined(__SSE2__)
      ip = get_match_16(ip, ip_bound + IP_BOUNDARY, ref);
#else
      ip = get_match(ip, ip_bound + IP_BOUNDARY, ref);
#endif
    }

    /* if we have copied something, adjust the copy count */
    if (copy)
      /* copy is biased, '0' means 1 byte copy */
      *(op - copy - 1) = copy - 1;
    else
      /* back, to overwrite the copy count */
      op--;

    /* reset literal counter */
    copy = 0;

    /* length is biased, '1' means a match of 3 bytes */
    ip -= 3;
    len = (int32_t)(ip - anchor);

    /* check that we have enough space to encode the match for all the cases */
    if (BLOSCLZ_UNEXPECT_CONDITIONAL(op + (len / 255) + 6 > op_limit)) goto out;
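    /*
     * Match token layout (as emitted below):
     *   near, short (len < 7):  [len:3 | dist_hi:5] [dist_lo:8]
     *   near, long  (len >= 7): the length field saturates at 7 and the
     *     remainder follows as a run of 255 bytes plus a final byte,
     *     before dist_lo.
     *   far (distance >= MAX_DISTANCE): dist_hi saturates at 31, a 255
     *     marker follows, then (distance - MAX_DISTANCE) as 16 bits.
     */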
    /* encode the match */
    if (distance < MAX_DISTANCE) {
      if (len < 7) {
        *op++ = (len << 5) + (distance >> 8);
        *op++ = (distance & 255);
      }
      else {
        *op++ = (uint8_t)((7 << 5) + (distance >> 8));
        for (len -= 7; len >= 255; len -= 255)
          *op++ = 255;
        *op++ = len;
        *op++ = (distance & 255);
      }
    }
    else {
      /* far away, but not yet in another galaxy... */
      if (len < 7) {
        distance -= MAX_DISTANCE;
        *op++ = (uint8_t)((len << 5) + 31);
        *op++ = 255;
        *op++ = (uint8_t)(distance >> 8);
        *op++ = distance & 255;
      }
      else {
        distance -= MAX_DISTANCE;
        *op++ = (7 << 5) + 31;
        for (len -= 7; len >= 255; len -= 255)
          *op++ = 255;
        *op++ = len;
        *op++ = 255;
        *op++ = (uint8_t)(distance >> 8);
        *op++ = distance & 255;
      }
    }
    /* update the hash at match boundary */
    HASH_FUNCTION(hval, ip, hash_log);
    htab[hval] = (uint16_t)(ip++ - ibase);
    HASH_FUNCTION(hval, ip, hash_log);
    htab[hval] = (uint16_t)(ip++ - ibase);

    /* assuming literal copy */
    *op++ = MAX_COPY - 1;
  }

  /* left-over as literal copy */
  ip_bound++;
  while (ip <= ip_bound) {
    if (BLOSCLZ_UNEXPECT_CONDITIONAL(op + 2 > op_limit)) goto out;
    *op++ = *ip++;
    copy++;
    if (copy == MAX_COPY) {
      copy = 0;
      *op++ = MAX_COPY - 1;
    }
  }

  /* if we have copied something, adjust the copy length */
  if (copy)
    *(op - copy - 1) = copy - 1;
  else
    op--;

  /* marker for blosclz */
  *(uint8_t*)output |= (1 << 5);

  free(htab);
  return (int)(op - (uint8_t*)output);

out:
  free(htab);
  return 0;
}
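/*
 * Decompress `length` bytes from `input` into `output`, which can hold at
 * most `maxout` bytes.  Returns the number of decompressed bytes, or 0 on a
 * bounds violation (only checked when BLOSCLZ_SAFE is defined).
 *
 * Token format read below: a control byte whose high 3 bits are zero
 * introduces a literal run of (ctrl + 1) bytes; otherwise the high 3 bits
 * are a biased match length and the low 5 bits are the top of the match
 * offset.
 */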
int blosclz_decompress(const void* input, int length, void* output, int maxout) {
  const uint8_t* ip = (const uint8_t*)input;
  const uint8_t* ip_limit = ip + length;
  uint8_t* op = (uint8_t*)output;
  int32_t ctrl = (*ip++) & 31;
  int32_t loop = 1;
#ifdef BLOSCLZ_SAFE
  uint8_t* op_limit = op + maxout;
#endif

  do {
    uint8_t* ref = op;
    int32_t len = ctrl >> 5;
    int32_t ofs = (ctrl & 31) << 8;

    if (ctrl >= 32) {
      uint8_t code;
      len--;
      ref -= ofs;
      if (len == 7 - 1)
        do {
          code = *ip++;
          len += code;
        } while (code == 255);
      code = *ip++;
      ref -= code;

      /* match from 16-bit distance */
      if (BLOSCLZ_UNEXPECT_CONDITIONAL(code == 255)) {
        if (BLOSCLZ_EXPECT_CONDITIONAL(ofs == (31 << 8))) {
          ofs = (*ip++) << 8;
          ofs += *ip++;
          ref = op - ofs - MAX_DISTANCE;
        }
      }
#ifdef BLOSCLZ_SAFE
      if (BLOSCLZ_UNEXPECT_CONDITIONAL(op + len + 3 > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNEXPECT_CONDITIONAL(ref - 1 < (uint8_t*)output)) {
        return 0;
      }
#endif
      if (BLOSCLZ_EXPECT_CONDITIONAL(ip < ip_limit))
        ctrl = *ip++;
      else
        loop = 0;

      if (ref == op) {
        /* optimized copy for a run */
        uint8_t b = ref[-1];
        memset(op, b, len + 3);
        op += len + 3;
      }
      else {
        /* copy from reference; source and destination may overlap when the
           match distance is shorter than its length, which safecopy() must
           handle byte-correctly */
        ref--;
        len += 3;
        op = safecopy(op, ref, (unsigned) len);
      }
    }
    else {
      ctrl++;
#ifdef BLOSCLZ_SAFE
      if (BLOSCLZ_UNEXPECT_CONDITIONAL(op + ctrl > op_limit)) {
        return 0;
      }
      if (BLOSCLZ_UNEXPECT_CONDITIONAL(ip + ctrl > ip_limit)) {
        return 0;
      }
#endif
      // memcpy(op, ip, ctrl); op += ctrl; ip += ctrl;
      // On GCC 6, fastcopy is still faster than plain memcpy.
      // However, with recent Clang/LLVM 9.0 there is almost no difference
      // in performance.
      op = fastcopy(op, ip, (unsigned) ctrl);
      ip += ctrl;
      loop = (int32_t)BLOSCLZ_EXPECT_CONDITIONAL(ip < ip_limit);
      if (loop)
        ctrl = *ip++;
    }
  } while (BLOSCLZ_EXPECT_CONDITIONAL(loop));

  return (int)(op - (uint8_t*)output);
}
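
/*
 * Round-trip usage sketch (illustrative only; the buffer sizes and the
 * opt_level of 5 below are assumptions, not part of this file):
 *
 *   char in[4096], out[4096], back[4096];
 *   memset(in, 'x', sizeof(in));          // highly compressible input
 *   int csize = blosclz_compress(5, in, sizeof(in), out, sizeof(out));
 *   if (csize > 0)                        // 0 means "did not compress enough"
 *     blosclz_decompress(out, csize, back, sizeof(back));
 */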