| 1 | #include "cache.h" |
| 2 | #include "attr.h" |
| 3 | |
| 4 | /* |
| 5 | * convert.c - convert a file when checking it out and checking it in. |
| 6 | * |
| 7 | * This should use the pathname to decide on whether it wants to do some |
| 8 | * more interesting conversions (automatic gzip/unzip, general format |
| 9 | * conversions etc etc), but by default it just does automatic CRLF<->LF |
| 10 | * translation when the "auto_crlf" option is set. |
| 11 | */ |
| 12 | |
/*
 * Per-path conversion actions, as derived from the "crlf" attribute:
 *   CRLF_GUESS  - attribute unset or unrecognized; fall back to the
 *                 auto_crlf setting and content heuristics.
 *   CRLF_BINARY - attribute explicitly false; never convert.
 *   CRLF_TEXT   - attribute explicitly true; convert in both directions.
 *   CRLF_INPUT  - attribute is "input"; convert only on the way into git.
 */
enum {
	CRLF_GUESS = -1,
	CRLF_BINARY = 0,
	CRLF_TEXT = 1,
	CRLF_INPUT = 2
};
| 17 | |
/* Byte statistics collected over a buffer by gather_stats(). */
struct text_stat {
	/* Number of CR bytes (including those that start a CRLF pair) */
	unsigned cr;
	/* Number of LF bytes (including those that end a CRLF pair) */
	unsigned lf;
	/* Number of CR bytes immediately followed by LF */
	unsigned crlf;

	/* Rough classification only -- these are just approximations! */
	unsigned printable;
	unsigned nonprintable;
};
| 25 | |
| 26 | static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats) |
| 27 | { |
| 28 | unsigned long i; |
| 29 | |
| 30 | memset(stats, 0, sizeof(*stats)); |
| 31 | |
| 32 | for (i = 0; i < size; i++) { |
| 33 | unsigned char c = buf[i]; |
| 34 | if (c == '\r') { |
| 35 | stats->cr++; |
| 36 | if (i+1 < size && buf[i+1] == '\n') |
| 37 | stats->crlf++; |
| 38 | continue; |
| 39 | } |
| 40 | if (c == '\n') { |
| 41 | stats->lf++; |
| 42 | continue; |
| 43 | } |
| 44 | if (c == 127) |
| 45 | /* DEL */ |
| 46 | stats->nonprintable++; |
| 47 | else if (c < 32) { |
| 48 | switch (c) { |
| 49 | /* BS, HT, ESC and FF */ |
| 50 | case '\b': case '\t': case '\033': case '\014': |
| 51 | stats->printable++; |
| 52 | break; |
| 53 | default: |
| 54 | stats->nonprintable++; |
| 55 | } |
| 56 | } |
| 57 | else |
| 58 | stats->printable++; |
| 59 | } |
| 60 | } |
| 61 | |
| 62 | /* |
| 63 | * The same heuristics as diff.c::mmfile_is_binary() |
| 64 | */ |
| 65 | static int is_binary(unsigned long size, struct text_stat *stats) |
| 66 | { |
| 67 | |
| 68 | if ((stats->printable >> 7) < stats->nonprintable) |
| 69 | return 1; |
| 70 | /* |
| 71 | * Other heuristics? Average line length might be relevant, |
| 72 | * as might LF vs CR vs CRLF counts.. |
| 73 | * |
| 74 | * NOTE! It might be normal to have a low ratio of CRLF to LF |
| 75 | * (somebody starts with a LF-only file and edits it with an editor |
| 76 | * that adds CRLF only to lines that are added..). But do we |
| 77 | * want to support CR-only? Probably not. |
| 78 | */ |
| 79 | return 0; |
| 80 | } |
| 81 | |
| 82 | static char *crlf_to_git(const char *path, const char *src, unsigned long *sizep, int action) |
| 83 | { |
| 84 | char *buffer, *dst; |
| 85 | unsigned long size, nsize; |
| 86 | struct text_stat stats; |
| 87 | |
| 88 | if ((action == CRLF_BINARY) || (action == CRLF_GUESS && !auto_crlf)) |
| 89 | return NULL; |
| 90 | |
| 91 | size = *sizep; |
| 92 | if (!size) |
| 93 | return NULL; |
| 94 | |
| 95 | gather_stats(src, size, &stats); |
| 96 | |
| 97 | /* No CR? Nothing to convert, regardless. */ |
| 98 | if (!stats.cr) |
| 99 | return NULL; |
| 100 | |
| 101 | if (action == CRLF_GUESS) { |
| 102 | /* |
| 103 | * We're currently not going to even try to convert stuff |
| 104 | * that has bare CR characters. Does anybody do that crazy |
| 105 | * stuff? |
| 106 | */ |
| 107 | if (stats.cr != stats.crlf) |
| 108 | return NULL; |
| 109 | |
| 110 | /* |
| 111 | * And add some heuristics for binary vs text, of course... |
| 112 | */ |
| 113 | if (is_binary(size, &stats)) |
| 114 | return NULL; |
| 115 | } |
| 116 | |
| 117 | /* |
| 118 | * Ok, allocate a new buffer, fill it in, and return it |
| 119 | * to let the caller know that we switched buffers. |
| 120 | */ |
| 121 | nsize = size - stats.crlf; |
| 122 | buffer = xmalloc(nsize); |
| 123 | *sizep = nsize; |
| 124 | |
| 125 | dst = buffer; |
| 126 | if (action == CRLF_GUESS) { |
| 127 | /* |
| 128 | * If we guessed, we already know we rejected a file with |
| 129 | * lone CR, and we can strip a CR without looking at what |
| 130 | * follow it. |
| 131 | */ |
| 132 | do { |
| 133 | unsigned char c = *src++; |
| 134 | if (c != '\r') |
| 135 | *dst++ = c; |
| 136 | } while (--size); |
| 137 | } else { |
| 138 | do { |
| 139 | unsigned char c = *src++; |
| 140 | if (! (c == '\r' && (1 < size && *src == '\n'))) |
| 141 | *dst++ = c; |
| 142 | } while (--size); |
| 143 | } |
| 144 | |
| 145 | return buffer; |
| 146 | } |
| 147 | |
| 148 | static char *crlf_to_worktree(const char *path, const char *src, unsigned long *sizep, int action) |
| 149 | { |
| 150 | char *buffer, *dst; |
| 151 | unsigned long size, nsize; |
| 152 | struct text_stat stats; |
| 153 | unsigned char last; |
| 154 | |
| 155 | if ((action == CRLF_BINARY) || (action == CRLF_INPUT) || |
| 156 | (action == CRLF_GUESS && auto_crlf <= 0)) |
| 157 | return NULL; |
| 158 | |
| 159 | size = *sizep; |
| 160 | if (!size) |
| 161 | return NULL; |
| 162 | |
| 163 | gather_stats(src, size, &stats); |
| 164 | |
| 165 | /* No LF? Nothing to convert, regardless. */ |
| 166 | if (!stats.lf) |
| 167 | return NULL; |
| 168 | |
| 169 | /* Was it already in CRLF format? */ |
| 170 | if (stats.lf == stats.crlf) |
| 171 | return NULL; |
| 172 | |
| 173 | if (action == CRLF_GUESS) { |
| 174 | /* If we have any bare CR characters, we're not going to touch it */ |
| 175 | if (stats.cr != stats.crlf) |
| 176 | return NULL; |
| 177 | |
| 178 | if (is_binary(size, &stats)) |
| 179 | return NULL; |
| 180 | } |
| 181 | |
| 182 | /* |
| 183 | * Ok, allocate a new buffer, fill it in, and return it |
| 184 | * to let the caller know that we switched buffers. |
| 185 | */ |
| 186 | nsize = size + stats.lf - stats.crlf; |
| 187 | buffer = xmalloc(nsize); |
| 188 | *sizep = nsize; |
| 189 | last = 0; |
| 190 | |
| 191 | dst = buffer; |
| 192 | do { |
| 193 | unsigned char c = *src++; |
| 194 | if (c == '\n' && last != '\r') |
| 195 | *dst++ = '\r'; |
| 196 | *dst++ = c; |
| 197 | last = c; |
| 198 | } while (--size); |
| 199 | |
| 200 | return buffer; |
| 201 | } |
| 202 | |
| 203 | static void setup_convert_check(struct git_attr_check *check) |
| 204 | { |
| 205 | static struct git_attr *attr_crlf; |
| 206 | |
| 207 | if (!attr_crlf) |
| 208 | attr_crlf = git_attr("crlf", 4); |
| 209 | check->attr = attr_crlf; |
| 210 | } |
| 211 | |
| 212 | static int git_path_check_crlf(const char *path, struct git_attr_check *check) |
| 213 | { |
| 214 | const char *value = check->value; |
| 215 | |
| 216 | if (ATTR_TRUE(value)) |
| 217 | return CRLF_TEXT; |
| 218 | else if (ATTR_FALSE(value)) |
| 219 | return CRLF_BINARY; |
| 220 | else if (ATTR_UNSET(value)) |
| 221 | ; |
| 222 | else if (!strcmp(value, "input")) |
| 223 | return CRLF_INPUT; |
| 224 | return CRLF_GUESS; |
| 225 | } |
| 226 | |
| 227 | char *convert_to_git(const char *path, const char *src, unsigned long *sizep) |
| 228 | { |
| 229 | struct git_attr_check check[1]; |
| 230 | int crlf = CRLF_GUESS; |
| 231 | |
| 232 | setup_convert_check(check); |
| 233 | if (!git_checkattr(path, 1, check)) { |
| 234 | crlf = git_path_check_crlf(path, check); |
| 235 | } |
| 236 | return crlf_to_git(path, src, sizep, crlf); |
| 237 | } |
| 238 | |
| 239 | char *convert_to_working_tree(const char *path, const char *src, unsigned long *sizep) |
| 240 | { |
| 241 | struct git_attr_check check[1]; |
| 242 | int crlf = CRLF_GUESS; |
| 243 | |
| 244 | setup_convert_check(check); |
| 245 | if (!git_checkattr(path, 1, check)) { |
| 246 | crlf = git_path_check_crlf(path, check); |
| 247 | } |
| 248 | return crlf_to_worktree(path, src, sizep, crlf); |
| 249 | } |