Don't try to delta if target is much smaller than source
[git/git.git] / diff-delta.c
CommitLineData
a310d434
NP
1/*
2 * diff-delta.c: generate a delta between two buffers
3 *
366b53c1
NP
4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5 * http://www.xmailserver.org/xdiff-lib.html
a310d434 6 *
366b53c1 7 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
a310d434 8 *
366b53c1
NP
9 * This code is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
a310d434
NP
12 */
13
63f17569 14#include "git-compat-util.h"
85023577 15#include "delta.h"
a310d434 16
08abe669
NP
17/* maximum hash entry list for the same hash bucket */
18#define HASH_LIMIT 64
a310d434 19
3dc5a9e4
NP
20#define RABIN_SHIFT 23
21#define RABIN_WINDOW 16
22
23static const unsigned int T[256] = {
24 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
25 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
26 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
27 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
28 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
29 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
30 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
31 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
32 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
33 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
34 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
35 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
36 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
37 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
38 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
39 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
40 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
41 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
42 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
43 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
44 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
45 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
46 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
47 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
48 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
49 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
50 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
51 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
52 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
53 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
54 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
55 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
56 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
57 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
58 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
59 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
60 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
61 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
62 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
63 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
64 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
65 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
66 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
67};
68
69static const unsigned int U[256] = {
70 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
71 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
72 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
73 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
74 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
75 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
76 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
77 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
78 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
79 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
80 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
81 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
82 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
83 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
84 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
85 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
86 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
87 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
88 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
89 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
90 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
91 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
92 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
93 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
94 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
95 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
96 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
97 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
98 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
99 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
100 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
101 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
102 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
103 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
104 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
105 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
106 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
107 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
108 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
109 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
110 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
111 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
112 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
113};
a310d434 114
08abe669 115struct index_entry {
a310d434 116 const unsigned char *ptr;
8e1454b5 117 unsigned int val;
08abe669 118 struct index_entry *next;
8e1454b5 119};
a310d434 120
08abe669
NP
121struct delta_index {
122 const void *src_buf;
123 unsigned long src_size;
3dc5a9e4 124 unsigned int hash_mask;
63f17569 125 struct index_entry *hash[FLEX_ARRAY];
08abe669
NP
126};
127
128struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
a310d434 129{
06a9f920 130 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
08abe669
NP
131 const unsigned char *data, *buffer = buf;
132 struct delta_index *index;
133 struct index_entry *entry, **hash;
8e1454b5 134 void *mem;
06a9f920 135 unsigned long memsize;
8e1454b5 136
08abe669
NP
137 if (!buf || !bufsize)
138 return NULL;
139
3dc5a9e4 140 /* Determine index hash size. Note that indexing skips the
82e5a82f 141 first byte to allow for optimizing the Rabin's polynomial
3dc5a9e4
NP
142 initialization in create_delta(). */
143 entries = (bufsize - 1) / RABIN_WINDOW;
8e1454b5 144 hsize = entries / 4;
b05faa2d 145 for (i = 4; (1u << i) < hsize && i < 31; i++);
8e1454b5 146 hsize = 1 << i;
3dc5a9e4 147 hmask = hsize - 1;
8e1454b5
NP
148
149 /* allocate lookup index */
06a9f920
NP
150 memsize = sizeof(*index) +
151 sizeof(*hash) * hsize +
152 sizeof(*entry) * entries;
153 mem = malloc(memsize);
8e1454b5
NP
154 if (!mem)
155 return NULL;
08abe669
NP
156 index = mem;
157 mem = index + 1;
8e1454b5 158 hash = mem;
08abe669
NP
159 mem = hash + hsize;
160 entry = mem;
161
162 index->src_buf = buf;
163 index->src_size = bufsize;
3dc5a9e4 164 index->hash_mask = hmask;
8e1454b5
NP
165 memset(hash, 0, hsize * sizeof(*hash));
166
c13c6bf7
NP
167 /* allocate an array to count hash entries */
168 hash_count = calloc(hsize, sizeof(*hash_count));
169 if (!hash_count) {
08abe669 170 free(index);
c13c6bf7
NP
171 return NULL;
172 }
173
174 /* then populate the index */
06a9f920
NP
175 prev_val = ~0;
176 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
177 data >= buffer;
178 data -= RABIN_WINDOW) {
3dc5a9e4
NP
179 unsigned int val = 0;
180 for (i = 1; i <= RABIN_WINDOW; i++)
181 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
06a9f920
NP
182 if (val == prev_val) {
183 /* keep the lowest of consecutive identical blocks */
184 entry[-1].ptr = data + RABIN_WINDOW;
185 } else {
186 prev_val = val;
187 i = val & hmask;
188 entry->ptr = data + RABIN_WINDOW;
189 entry->val = val;
190 entry->next = hash[i];
191 hash[i] = entry++;
192 hash_count[i]++;
06a9f920 193 }
3dc5a9e4 194 }
a310d434 195
c13c6bf7
NP
196 /*
197 * Determine a limit on the number of entries in the same hash
82e5a82f 198 * bucket. This guards us against pathological data sets causing
c13c6bf7
NP
199 * really bad hash distribution with most entries in the same hash
200 * bucket that would bring us to O(m*n) computing costs (m and n
201 * corresponding to reference and target buffer sizes).
202 *
08abe669 203 * Make sure none of the hash buckets has more entries than
c13c6bf7
NP
204 * we're willing to test. Otherwise we cull the entry list
205 * uniformly to still preserve a good repartition across
206 * the reference buffer.
207 */
208 for (i = 0; i < hsize; i++) {
08abe669 209 if (hash_count[i] < HASH_LIMIT)
c13c6bf7
NP
210 continue;
211 entry = hash[i];
212 do {
08abe669
NP
213 struct index_entry *keep = entry;
214 int skip = hash_count[i] / HASH_LIMIT / 2;
c13c6bf7
NP
215 do {
216 entry = entry->next;
217 } while(--skip && entry);
218 keep->next = entry;
219 } while(entry);
220 }
221 free(hash_count);
222
08abe669
NP
223 return index;
224}
225
226void free_delta_index(struct delta_index *index)
227{
228 free(index);
a310d434
NP
229}
230
3dc5a9e4
NP
231/*
232 * The maximum size for any opcode sequence, including the initial header
82e5a82f 233 * plus Rabin window plus biggest copy.
3dc5a9e4
NP
234 */
235#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
fe474b58 236
08abe669
NP
237void *
238create_delta(const struct delta_index *index,
239 const void *trg_buf, unsigned long trg_size,
240 unsigned long *delta_size, unsigned long max_size)
a310d434 241{
84336696 242 unsigned int i, outpos, outsize, moff, msize, val;
5a1fb2ca 243 int inscnt;
8e1454b5
NP
244 const unsigned char *ref_data, *ref_top, *data, *top;
245 unsigned char *out;
a310d434 246
08abe669 247 if (!trg_buf || !trg_size)
8e1454b5
NP
248 return NULL;
249
a310d434
NP
250 outpos = 0;
251 outsize = 8192;
fe474b58
NP
252 if (max_size && outsize >= max_size)
253 outsize = max_size + MAX_OP_SIZE + 1;
a310d434 254 out = malloc(outsize);
08abe669 255 if (!out)
a310d434 256 return NULL;
a310d434
NP
257
258 /* store reference buffer size */
08abe669
NP
259 i = index->src_size;
260 while (i >= 0x80) {
261 out[outpos++] = i | 0x80;
262 i >>= 7;
69a2d426 263 }
08abe669 264 out[outpos++] = i;
a310d434
NP
265
266 /* store target buffer size */
08abe669
NP
267 i = trg_size;
268 while (i >= 0x80) {
269 out[outpos++] = i | 0x80;
270 i >>= 7;
69a2d426 271 }
08abe669 272 out[outpos++] = i;
a310d434 273
08abe669
NP
274 ref_data = index->src_buf;
275 ref_top = ref_data + index->src_size;
276 data = trg_buf;
1d7f171c 277 top = (const unsigned char *) trg_buf + trg_size;
3dc5a9e4
NP
278
279 outpos++;
280 val = 0;
281 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
282 out[outpos++] = *data;
283 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
284 }
285 inscnt = i;
a310d434 286
84336696
NP
287 moff = 0;
288 msize = 0;
8e1454b5 289 while (data < top) {
84336696
NP
290 if (msize < 4096) {
291 struct index_entry *entry;
292 val ^= U[data[-RABIN_WINDOW]];
293 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
294 i = val & index->hash_mask;
295 for (entry = index->hash[i]; entry; entry = entry->next) {
296 const unsigned char *ref = entry->ptr;
297 const unsigned char *src = data;
298 unsigned int ref_size = ref_top - ref;
299 if (entry->val != val)
300 continue;
301 if (ref_size > top - src)
302 ref_size = top - src;
303 if (ref_size <= msize)
304 break;
305 while (ref_size-- && *src++ == *ref)
306 ref++;
307 if (msize < ref - entry->ptr) {
308 /* this is our best match so far */
309 msize = ref - entry->ptr;
310 moff = entry->ptr - ref_data;
311 if (msize >= 4096) /* good enough */
312 break;
313 }
a310d434
NP
314 }
315 }
316
3dc5a9e4 317 if (msize < 4) {
a310d434
NP
318 if (!inscnt)
319 outpos++;
320 out[outpos++] = *data++;
321 inscnt++;
322 if (inscnt == 0x7f) {
323 out[outpos - inscnt - 1] = inscnt;
324 inscnt = 0;
325 }
84336696 326 msize = 0;
a310d434 327 } else {
84336696 328 unsigned int left;
8e1454b5
NP
329 unsigned char *op;
330
a310d434 331 if (inscnt) {
5a1fb2ca 332 while (moff && ref_data[moff-1] == data[-1]) {
5a1fb2ca
NP
333 /* we can match one byte back */
334 msize++;
335 moff--;
336 data--;
337 outpos--;
338 if (--inscnt)
339 continue;
340 outpos--; /* remove count slot */
341 inscnt--; /* make it -1 */
342 break;
343 }
a310d434
NP
344 out[outpos - inscnt - 1] = inscnt;
345 inscnt = 0;
346 }
347
84336696
NP
348 /* A copy op is currently limited to 64KB (pack v2) */
349 left = (msize < 0x10000) ? 0 : (msize - 0x10000);
350 msize -= left;
351
8e1454b5 352 op = out + outpos++;
a310d434
NP
353 i = 0x80;
354
84336696
NP
355 if (moff & 0x000000ff)
356 out[outpos++] = moff >> 0, i |= 0x01;
357 if (moff & 0x0000ff00)
358 out[outpos++] = moff >> 8, i |= 0x02;
359 if (moff & 0x00ff0000)
360 out[outpos++] = moff >> 16, i |= 0x04;
361 if (moff & 0xff000000)
362 out[outpos++] = moff >> 24, i |= 0x08;
a310d434 363
84336696
NP
364 if (msize & 0x00ff)
365 out[outpos++] = msize >> 0, i |= 0x10;
366 if (msize & 0xff00)
367 out[outpos++] = msize >> 8, i |= 0x20;
a310d434 368
8e1454b5 369 *op = i;
84336696
NP
370
371 data += msize;
372 moff += msize;
373 msize = left;
374
375 if (msize < 4096) {
376 int j;
377 val = 0;
378 for (j = -RABIN_WINDOW; j < 0; j++)
379 val = ((val << 8) | data[j])
380 ^ T[val >> RABIN_SHIFT];
381 }
a310d434
NP
382 }
383
fe474b58 384 if (outpos >= outsize - MAX_OP_SIZE) {
a310d434
NP
385 void *tmp = out;
386 outsize = outsize * 3 / 2;
fe474b58
NP
387 if (max_size && outsize >= max_size)
388 outsize = max_size + MAX_OP_SIZE + 1;
389 if (max_size && outpos > max_size)
3dc5a9e4 390 break;
b75c6c6d 391 out = realloc(out, outsize);
a310d434
NP
392 if (!out) {
393 free(tmp);
a310d434
NP
394 return NULL;
395 }
396 }
397 }
398
399 if (inscnt)
400 out[outpos - inscnt - 1] = inscnt;
401
3dc5a9e4
NP
402 if (max_size && outpos > max_size) {
403 free(out);
404 return NULL;
405 }
406
a310d434
NP
407 *delta_size = outpos;
408 return out;
409}