Rewrite convert_to_{git,working_tree} to use strbufs.
[git/git.git] / convert.c
1 #include "cache.h"
2 #include "attr.h"
3 #include "run-command.h"
4 #include "strbuf.h"
5
6 /*
7 * convert.c - convert a file when checking it out and checking it in.
8 *
9 * This should use the pathname to decide on whether it wants to do some
10 * more interesting conversions (automatic gzip/unzip, general format
11 * conversions etc etc), but by default it just does automatic CRLF<->LF
12 * translation when the "auto_crlf" option is set.
13 */
14
/*
 * Values describing how CR/LF conversion should treat a path:
 * guess from the content, never convert, always treat as text, or
 * convert on input only.
 */
enum {
	CRLF_GUESS = -1,
	CRLF_BINARY = 0,
	CRLF_TEXT = 1,
	CRLF_INPUT = 2
};
19
/*
 * Byte-class tallies collected by gather_stats() over one buffer.
 */
struct text_stat {
	unsigned cr;		/* number of CR bytes */
	unsigned lf;		/* number of LF bytes */
	unsigned crlf;		/* number of CR bytes immediately followed by LF */

	/* These are just approximations! */
	unsigned printable;
	unsigned nonprintable;
};
27
28 static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
29 {
30 unsigned long i;
31
32 memset(stats, 0, sizeof(*stats));
33
34 for (i = 0; i < size; i++) {
35 unsigned char c = buf[i];
36 if (c == '\r') {
37 stats->cr++;
38 if (i+1 < size && buf[i+1] == '\n')
39 stats->crlf++;
40 continue;
41 }
42 if (c == '\n') {
43 stats->lf++;
44 continue;
45 }
46 if (c == 127)
47 /* DEL */
48 stats->nonprintable++;
49 else if (c < 32) {
50 switch (c) {
51 /* BS, HT, ESC and FF */
52 case '\b': case '\t': case '\033': case '\014':
53 stats->printable++;
54 break;
55 default:
56 stats->nonprintable++;
57 }
58 }
59 else
60 stats->printable++;
61 }
62 }
63
64 /*
65 * The same heuristics as diff.c::mmfile_is_binary()
66 */
67 static int is_binary(unsigned long size, struct text_stat *stats)
68 {
69
70 if ((stats->printable >> 7) < stats->nonprintable)
71 return 1;
72 /*
73 * Other heuristics? Average line length might be relevant,
74 * as might LF vs CR vs CRLF counts..
75 *
76 * NOTE! It might be normal to have a low ratio of CRLF to LF
77 * (somebody starts with a LF-only file and edits it with an editor
78 * that adds CRLF only to lines that are added..). But do we
79 * want to support CR-only? Probably not.
80 */
81 return 0;
82 }
83
84 static int crlf_to_git(const char *path, const char *src, size_t len,
85 struct strbuf *buf, int action)
86 {
87 struct text_stat stats;
88 char *dst;
89
90 if ((action == CRLF_BINARY) || !auto_crlf || !len)
91 return 0;
92
93 gather_stats(src, len, &stats);
94 /* No CR? Nothing to convert, regardless. */
95 if (!stats.cr)
96 return 0;
97
98 if (action == CRLF_GUESS) {
99 /*
100 * We're currently not going to even try to convert stuff
101 * that has bare CR characters. Does anybody do that crazy
102 * stuff?
103 */
104 if (stats.cr != stats.crlf)
105 return 0;
106
107 /*
108 * And add some heuristics for binary vs text, of course...
109 */
110 if (is_binary(len, &stats))
111 return 0;
112 }
113
114 strbuf_grow(buf, len);
115 dst = buf->buf;
116 if (action == CRLF_GUESS) {
117 /*
118 * If we guessed, we already know we rejected a file with
119 * lone CR, and we can strip a CR without looking at what
120 * follow it.
121 */
122 do {
123 unsigned char c = *src++;
124 if (c != '\r')
125 *dst++ = c;
126 } while (--len);
127 } else {
128 do {
129 unsigned char c = *src++;
130 if (! (c == '\r' && (1 < len && *src == '\n')))
131 *dst++ = c;
132 } while (--len);
133 }
134 strbuf_setlen(buf, dst - buf->buf);
135 return 1;
136 }
137
138 static int crlf_to_worktree(const char *path, const char *src, size_t len,
139 struct strbuf *buf, int action)
140 {
141 char *to_free = NULL;
142 struct text_stat stats;
143
144 if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
145 auto_crlf <= 0)
146 return 0;
147
148 if (!len)
149 return 0;
150
151 gather_stats(src, len, &stats);
152
153 /* No LF? Nothing to convert, regardless. */
154 if (!stats.lf)
155 return 0;
156
157 /* Was it already in CRLF format? */
158 if (stats.lf == stats.crlf)
159 return 0;
160
161 if (action == CRLF_GUESS) {
162 /* If we have any bare CR characters, we're not going to touch it */
163 if (stats.cr != stats.crlf)
164 return 0;
165
166 if (is_binary(len, &stats))
167 return 0;
168 }
169
170 /* are we "faking" in place editing ? */
171 if (src == buf->buf)
172 to_free = strbuf_detach(buf);
173
174 strbuf_grow(buf, len + stats.lf - stats.crlf);
175 for (;;) {
176 const char *nl = memchr(src, '\n', len);
177 if (!nl)
178 break;
179 if (nl > src && nl[-1] == '\r') {
180 strbuf_add(buf, src, nl + 1 - src);
181 } else {
182 strbuf_add(buf, src, nl - src);
183 strbuf_addstr(buf, "\r\n");
184 }
185 len -= nl + 1 - src;
186 src = nl + 1;
187 }
188 strbuf_add(buf, src, len);
189
190 free(to_free);
191 return 1;
192 }
193
194 static int filter_buffer(const char *path, const char *src,
195 unsigned long size, const char *cmd)
196 {
197 /*
198 * Spawn cmd and feed the buffer contents through its stdin.
199 */
200 struct child_process child_process;
201 int pipe_feed[2];
202 int write_err, status;
203
204 memset(&child_process, 0, sizeof(child_process));
205
206 if (pipe(pipe_feed) < 0) {
207 error("cannot create pipe to run external filter %s", cmd);
208 return 1;
209 }
210
211 child_process.pid = fork();
212 if (child_process.pid < 0) {
213 error("cannot fork to run external filter %s", cmd);
214 close(pipe_feed[0]);
215 close(pipe_feed[1]);
216 return 1;
217 }
218 if (!child_process.pid) {
219 dup2(pipe_feed[0], 0);
220 close(pipe_feed[0]);
221 close(pipe_feed[1]);
222 execlp("sh", "sh", "-c", cmd, NULL);
223 return 1;
224 }
225 close(pipe_feed[0]);
226
227 write_err = (write_in_full(pipe_feed[1], src, size) < 0);
228 if (close(pipe_feed[1]))
229 write_err = 1;
230 if (write_err)
231 error("cannot feed the input to external filter %s", cmd);
232
233 status = finish_command(&child_process);
234 if (status)
235 error("external filter %s failed %d", cmd, -status);
236 return (write_err || status);
237 }
238
239 static int apply_filter(const char *path, const char *src, size_t len,
240 struct strbuf *dst, const char *cmd)
241 {
242 /*
243 * Create a pipeline to have the command filter the buffer's
244 * contents.
245 *
246 * (child --> cmd) --> us
247 */
248 int pipe_feed[2];
249 int status, ret = 1;
250 struct child_process child_process;
251 struct strbuf nbuf;
252
253 if (!cmd)
254 return 0;
255
256 memset(&child_process, 0, sizeof(child_process));
257
258 if (pipe(pipe_feed) < 0) {
259 error("cannot create pipe to run external filter %s", cmd);
260 return 0;
261 }
262
263 fflush(NULL);
264 child_process.pid = fork();
265 if (child_process.pid < 0) {
266 error("cannot fork to run external filter %s", cmd);
267 close(pipe_feed[0]);
268 close(pipe_feed[1]);
269 return 0;
270 }
271 if (!child_process.pid) {
272 dup2(pipe_feed[1], 1);
273 close(pipe_feed[0]);
274 close(pipe_feed[1]);
275 exit(filter_buffer(path, src, len, cmd));
276 }
277 close(pipe_feed[1]);
278
279 strbuf_init(&nbuf, 0);
280 if (strbuf_read(&nbuf, pipe_feed[0], len) < 0) {
281 error("read from external filter %s failed", cmd);
282 ret = 0;
283 }
284 if (close(pipe_feed[0])) {
285 ret = error("read from external filter %s failed", cmd);
286 ret = 0;
287 }
288 status = finish_command(&child_process);
289 if (status) {
290 ret = error("external filter %s failed %d", cmd, -status);
291 ret = 0;
292 }
293
294 if (ret) {
295 *dst = nbuf;
296 } else {
297 strbuf_release(&nbuf);
298 }
299 return ret;
300 }
301
/*
 * One user-configured conversion driver ("filter.<name>.*").
 * Drivers are kept on a singly linked list.
 */
struct convert_driver {
	const char *name;		/* the <name> part of the config key */
	struct convert_driver *next;	/* next driver on the list */
	char *smudge;			/* value of filter.<name>.smudge */
	char *clean;			/* value of filter.<name>.clean */
};

/* Head of the driver list and the link where the next driver is appended. */
static struct convert_driver *user_convert;
static struct convert_driver **user_convert_tail;
308
309 static int read_convert_config(const char *var, const char *value)
310 {
311 const char *ep, *name;
312 int namelen;
313 struct convert_driver *drv;
314
315 /*
316 * External conversion drivers are configured using
317 * "filter.<name>.variable".
318 */
319 if (prefixcmp(var, "filter.") || (ep = strrchr(var, '.')) == var + 6)
320 return 0;
321 name = var + 7;
322 namelen = ep - name;
323 for (drv = user_convert; drv; drv = drv->next)
324 if (!strncmp(drv->name, name, namelen) && !drv->name[namelen])
325 break;
326 if (!drv) {
327 char *namebuf;
328 drv = xcalloc(1, sizeof(struct convert_driver));
329 namebuf = xmalloc(namelen + 1);
330 memcpy(namebuf, name, namelen);
331 namebuf[namelen] = 0;
332 drv->name = namebuf;
333 drv->next = NULL;
334 *user_convert_tail = drv;
335 user_convert_tail = &(drv->next);
336 }
337
338 ep++;
339
340 /*
341 * filter.<name>.smudge and filter.<name>.clean specifies
342 * the command line:
343 *
344 * command-line
345 *
346 * The command-line will not be interpolated in any way.
347 */
348
349 if (!strcmp("smudge", ep)) {
350 if (!value)
351 return error("%s: lacks value", var);
352 drv->smudge = strdup(value);
353 return 0;
354 }
355
356 if (!strcmp("clean", ep)) {
357 if (!value)
358 return error("%s: lacks value", var);
359 drv->clean = strdup(value);
360 return 0;
361 }
362 return 0;
363 }
364
365 static void setup_convert_check(struct git_attr_check *check)
366 {
367 static struct git_attr *attr_crlf;
368 static struct git_attr *attr_ident;
369 static struct git_attr *attr_filter;
370
371 if (!attr_crlf) {
372 attr_crlf = git_attr("crlf", 4);
373 attr_ident = git_attr("ident", 5);
374 attr_filter = git_attr("filter", 6);
375 user_convert_tail = &user_convert;
376 git_config(read_convert_config);
377 }
378 check[0].attr = attr_crlf;
379 check[1].attr = attr_ident;
380 check[2].attr = attr_filter;
381 }
382
/*
 * Count the ident keywords in cp[0..size): both the collapsed form
 * "$Id$" and the expanded form "$Id: ... $".
 *
 *   "$Id: 0000000000000000000000000000000000000000 $" <=> "$Id$"
 *
 * An expanded keyword that never reaches a closing '$' is not counted.
 */
static int count_ident(const char *cp, unsigned long size)
{
	unsigned long left = size;
	int found = 0;

	while (left) {
		char marker;

		left--;
		if (*cp++ != '$')
			continue;
		/* need at least "Id" plus the byte that follows it */
		if (left < 3)
			break;
		if (memcmp("Id", cp, 2) != 0)
			continue;

		marker = cp[2];
		cp += 3;
		left -= 3;

		if (marker == '$') {
			/* $Id$ */
			found++;
			continue;
		}
		if (marker != ':')
			continue;

		/*
		 * "$Id: ... "; scan up to the closing dollar sign and discard.
		 */
		while (left) {
			left--;
			if (*cp++ == '$') {
				found++;
				break;
			}
		}
	}
	return found;
}
422
423 static int ident_to_git(const char *path, const char *src, size_t len,
424 struct strbuf *buf, int ident)
425 {
426 char *dst, *dollar;
427
428 if (!ident || !count_ident(src, len))
429 return 0;
430
431 strbuf_grow(buf, len);
432 dst = buf->buf;
433 for (;;) {
434 dollar = memchr(src, '$', len);
435 if (!dollar)
436 break;
437 memcpy(dst, src, dollar + 1 - src);
438 dst += dollar + 1 - src;
439 len -= dollar + 1 - src;
440 src = dollar + 1;
441
442 if (len > 3 && !memcmp(src, "Id:", 3)) {
443 dollar = memchr(src + 3, '$', len - 3);
444 if (!dollar)
445 break;
446 memcpy(dst, "Id$", 3);
447 dst += 3;
448 len -= dollar + 1 - src;
449 src = dollar + 1;
450 }
451 }
452 memcpy(dst, src, len);
453 strbuf_setlen(buf, dst + len - buf->buf);
454 return 1;
455 }
456
457 static int ident_to_worktree(const char *path, const char *src, size_t len,
458 struct strbuf *buf, int ident)
459 {
460 unsigned char sha1[20];
461 char *to_free = NULL, *dollar;
462 int cnt;
463
464 if (!ident)
465 return 0;
466
467 cnt = count_ident(src, len);
468 if (!cnt)
469 return 0;
470
471 /* are we "faking" in place editing ? */
472 if (src == buf->buf)
473 to_free = strbuf_detach(buf);
474 hash_sha1_file(src, len, "blob", sha1);
475
476 strbuf_grow(buf, len + cnt * 43);
477 for (;;) {
478 /* step 1: run to the next '$' */
479 dollar = memchr(src, '$', len);
480 if (!dollar)
481 break;
482 strbuf_add(buf, src, dollar + 1 - src);
483 len -= dollar + 1 - src;
484 src = dollar + 1;
485
486 /* step 2: does it looks like a bit like Id:xxx$ or Id$ ? */
487 if (len < 3 || memcmp("Id", src, 2))
488 continue;
489
490 /* step 3: skip over Id$ or Id:xxxxx$ */
491 if (src[2] == '$') {
492 src += 3;
493 len -= 3;
494 } else if (src[2] == ':') {
495 /*
496 * It's possible that an expanded Id has crept its way into the
497 * repository, we cope with that by stripping the expansion out
498 */
499 dollar = memchr(src + 3, '$', len - 3);
500 if (!dollar) {
501 /* incomplete keyword, no more '$', so just quit the loop */
502 break;
503 }
504
505 len -= dollar + 1 - src;
506 src = dollar + 1;
507 } else {
508 /* it wasn't a "Id$" or "Id:xxxx$" */
509 continue;
510 }
511
512 /* step 4: substitute */
513 strbuf_addstr(buf, "Id: ");
514 strbuf_add(buf, sha1_to_hex(sha1), 40);
515 strbuf_addstr(buf, " $");
516 }
517 strbuf_add(buf, src, len);
518
519 free(to_free);
520 return 1;
521 }
522
523 static int git_path_check_crlf(const char *path, struct git_attr_check *check)
524 {
525 const char *value = check->value;
526
527 if (ATTR_TRUE(value))
528 return CRLF_TEXT;
529 else if (ATTR_FALSE(value))
530 return CRLF_BINARY;
531 else if (ATTR_UNSET(value))
532 ;
533 else if (!strcmp(value, "input"))
534 return CRLF_INPUT;
535 return CRLF_GUESS;
536 }
537
538 static struct convert_driver *git_path_check_convert(const char *path,
539 struct git_attr_check *check)
540 {
541 const char *value = check->value;
542 struct convert_driver *drv;
543
544 if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value))
545 return NULL;
546 for (drv = user_convert; drv; drv = drv->next)
547 if (!strcmp(value, drv->name))
548 return drv;
549 return NULL;
550 }
551
552 static int git_path_check_ident(const char *path, struct git_attr_check *check)
553 {
554 const char *value = check->value;
555
556 return !!ATTR_TRUE(value);
557 }
558
559 int convert_to_git(const char *path, const char *src, size_t len, struct strbuf *dst)
560 {
561 struct git_attr_check check[3];
562 int crlf = CRLF_GUESS;
563 int ident = 0, ret = 0;
564 char *filter = NULL;
565
566 setup_convert_check(check);
567 if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
568 struct convert_driver *drv;
569 crlf = git_path_check_crlf(path, check + 0);
570 ident = git_path_check_ident(path, check + 1);
571 drv = git_path_check_convert(path, check + 2);
572 if (drv && drv->clean)
573 filter = drv->clean;
574 }
575
576 ret |= apply_filter(path, src, len, dst, filter);
577 if (ret) {
578 src = dst->buf;
579 len = dst->len;
580 }
581 ret |= crlf_to_git(path, src, len, dst, crlf);
582 if (ret) {
583 src = dst->buf;
584 len = dst->len;
585 }
586 return ret | ident_to_git(path, src, len, dst, ident);
587 }
588
589 int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst)
590 {
591 struct git_attr_check check[3];
592 int crlf = CRLF_GUESS;
593 int ident = 0, ret = 0;
594 char *filter = NULL;
595
596 setup_convert_check(check);
597 if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
598 struct convert_driver *drv;
599 crlf = git_path_check_crlf(path, check + 0);
600 ident = git_path_check_ident(path, check + 1);
601 drv = git_path_check_convert(path, check + 2);
602 if (drv && drv->smudge)
603 filter = drv->smudge;
604 }
605
606 ret |= ident_to_worktree(path, src, len, dst, ident);
607 if (ret) {
608 src = dst->buf;
609 len = dst->len;
610 }
611 ret |= crlf_to_worktree(path, src, len, dst, crlf);
612 if (ret) {
613 src = dst->buf;
614 len = dst->len;
615 }
616 return ret | apply_filter(path, src, len, dst, filter);
617 }