[PATCH] archimport autodetects import status, supports incremental imports
[git/git.git] / git-archimport-script
CommitLineData
d3968363
ML
1#!/usr/bin/perl -w
2#
3# This tool is copyright (c) 2005, Martin Langhoff.
4# It is released under the Gnu Public License, version 2.
5#
6# The basic idea is to walk the output of tla abrowse,
7# fetch the changesets and apply them.
8#
9=head1 Invocation
10
11 git-archimport-script -i <archive>/<branch> [<archive>/<branch>]
12 [ <archive>/<branch> ]
13
14 The script expects you to provide the key roots where it can start the
15 import from an 'initial import' or 'tag' type of Arch commit. It will
16 then follow all the branching and tagging within the provided roots.
17
18 It will die if it sees branches that have different roots.
19
20=head2 TODO
21
22 - keep track of merged patches, and mark a git merge when it happens
23 - smarter rules to parse the archive history "up" and "down"
24 - be able to continue an import where we left off
25 - audit shell-escaping of filenames
26
27=head1 Devel tricks
28
29Add print in front of the shell commands invoked via backticks.
30
31=cut
32
33use strict;
34use warnings;
35use Getopt::Std;
36use File::Spec;
37use File::Temp qw(tempfile);
38use File::Path qw(mkpath);
39use File::Basename qw(basename dirname);
40use String::ShellQuote;
41use Time::Local;
42use IO::Socket;
43use IO::Pipe;
44use POSIX qw(strftime dup2);
45use Data::Dumper qw/ Dumper /;
46use IPC::Open2;
47
48$SIG{'PIPE'}="IGNORE";
49$ENV{'TZ'}="UTC";
50
51our($opt_h,$opt_v, $opt_T,
3292ae47 52 $opt_C,$opt_t);
d3968363
ML
53
54sub usage() {
55 print STDERR <<END;
56Usage: ${\basename $0} # fetch/update GIT from Arch
3292ae47 57 [ -h ] [ -v ] [ -T ]
d3968363
ML
58 [ -C GIT_repository ] [ -t tempdir ]
59 repository/arch-branch [ repository/arch-branch] ...
60END
61 exit(1);
62}
63
64getopts("hviC:t:") or usage();
65usage if $opt_h;
66
67@ARGV >= 1 or usage();
68my @arch_roots = @ARGV;
69
70my $tmp = $opt_t;
71$tmp ||= '/tmp';
72$tmp .= '/git-archimport/';
73
74my $git_tree = $opt_C;
75$git_tree ||= ".";
76
77
78my @psets = (); # the collection
79
80foreach my $root (@arch_roots) {
81 my ($arepo, $abranch) = split(m!/!, $root);
82 open ABROWSE, "tla abrowse -f -A $arepo --desc --merges $abranch |"
83 or die "Problems with tla abrowse: $!";
84
85 my %ps = (); # the current one
86 my $mode = '';
87 my $lastseen = '';
88
89 while (<ABROWSE>) {
90 chomp;
91
92 # first record padded w 8 spaces
93 if (s/^\s{8}\b//) {
94
95 # store the record we just captured
96 if (%ps) {
97 my %temp = %ps; # break references
98 push (@psets, \%temp);
99 %ps = ();
100 }
101
102 my ($id, $type) = split(m/\s{3}/, $_);
103 $ps{id} = $id;
104 $ps{repo} = $arepo;
105
106 # deal with types
107 if ($type =~ m/^\(simple changeset\)/) {
108 $ps{type} = 's';
109 } elsif ($type eq '(initial import)') {
110 $ps{type} = 'i';
111 } elsif ($type =~ m/^\(tag revision of (.+)\)/) {
112 $ps{type} = 't';
113 $ps{tag} = $1;
114 } else {
115 warn "Unknown type $type";
116 }
117 $lastseen = 'id';
118 }
119
120 if (s/^\s{10}//) {
121 # 10 leading spaces or more
122 # indicate commit metadata
123
124 # date & author
125 if ($lastseen eq 'id' && m/^\d{4}-\d{2}-\d{2}/) {
126
127 my ($date, $authoremail) = split(m/\s{2,}/, $_);
128 $ps{date} = $date;
129 $ps{date} =~ s/\bGMT$//; # strip off trailign GMT
130 if ($ps{date} =~ m/\b\w+$/) {
131 warn 'Arch dates not in GMT?! - imported dates will be wrong';
132 }
133
134 $authoremail =~ m/^(.+)\s(\S+)$/;
135 $ps{author} = $1;
136 $ps{email} = $2;
137
138 $lastseen = 'date';
139
140 } elsif ($lastseen eq 'date') {
141 # the only hint is position
142 # subject is after date
143 $ps{subj} = $_;
144 $lastseen = 'subj';
145
146 } elsif ($lastseen eq 'subj' && $_ eq 'merges in:') {
147 $ps{merges} = [];
148 $lastseen = 'merges';
149
150 } elsif ($lastseen eq 'merges' && s/^\s{2}//) {
151 push (@{$ps{merges}}, $_);
152 } else {
153 warn 'more metadata after merges!?';
154 }
155
156 }
157 }
158
159 if (%ps) {
160 my %temp = %ps; # break references
161 push (@psets, \%temp);
162 %ps = ();
163 }
164 close ABROWSE;
165} # end foreach $root
166
167## Order patches by time
168@psets = sort {$a->{date}.$b->{id} cmp $b->{date}.$b->{id}} @psets;
169
170#print Dumper \@psets;
171
172##
173## TODO cleanup irrelevant patches
174## and put an initial import
175## or a full tag
3292ae47
ML
176my $import = 0;
177unless (-d '.git') { # initial import
d3968363
ML
178 if ($psets[0]{type} eq 'i' || $psets[0]{type} eq 't') {
179 print "Starting import from $psets[0]{id}\n";
3292ae47
ML
180 `git-init-db`;
181 die $! if $?;
182 $import = 1;
d3968363
ML
183 } else {
184 die "Need to start from an import or a tag -- cannot use $psets[0]{id}";
185 }
d3968363
ML
186}
187
3292ae47 188# process patchsets
d3968363
ML
189foreach my $ps (@psets) {
190
191 $ps->{branch} = branchname($ps->{id});
192
193 #
194 # ensure we have a clean state
195 #
196 if (`git diff-files`) {
197 die "Unclean tree when about to process $ps->{id} " .
198 " - did we fail to commit cleanly before?";
199 }
200 die $! if $?;
201
3292ae47
ML
202 #
203 # skip commits already in repo
204 #
205 if (ptag($ps->{id})) {
206 $opt_v && print "Skipping already imported: $ps->{id}\n";
207 next;
208 }
209
d3968363
ML
210 #
211 # create the branch if needed
212 #
3292ae47
ML
213 if ($ps->{type} eq 'i' && !$import) {
214 die "Should not have more than one 'Initial import' per GIT import: $ps->{id}";
d3968363
ML
215 }
216
3292ae47 217 unless ($import) { # skip for import
d3968363
ML
218 if ( -e ".git/refs/heads/$ps->{branch}") {
219 # we know about this branch
220 `git checkout $ps->{branch}`;
221 } else {
222 # new branch! we need to verify a few things
223 die "Branch on a non-tag!" unless $ps->{type} eq 't';
224 my $branchpoint = ptag($ps->{tag});
225 die "Tagging from unknown id unsupported: $ps->{tag}"
226 unless $branchpoint;
227
228 # find where we are supposed to branch from
229 `git checkout -b $ps->{branch} $branchpoint`;
230 }
231 die $! if $?;
232 }
233
d3968363
ML
234 #
235 # Apply the import/changeset/merge into the working tree
236 #
237 if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
d3968363 238 apply_import($ps) or die $!;
3292ae47 239 $import=0;
d3968363
ML
240 } elsif ($ps->{type} eq 's') {
241 apply_cset($ps);
242 }
243
244 #
245 # prepare update git's index, based on what arch knows
246 # about the pset, resolve parents, etc
247 #
248 my $tree;
249
250 my $commitlog = `tla cat-archive-log -A $ps->{repo} $ps->{id}`;
251 die "Error in cat-archive-log: $!" if $?;
252
253 # parselog will git-add/rm files
254 # and generally prepare things for the commit
255 # NOTE: parselog will shell-quote filenames!
256 my ($sum, $msg, $add, $del, $mod, $ren) = parselog($commitlog);
257 my $logmessage = "$sum\n$msg";
258
259
260 # imports don't give us good info
261 # on added files. Shame on them
262 if ($ps->{type} eq 'i' || $ps->{type} eq 't') {
263 `find . -type f -print0 | grep -zv '^./.git' | xargs -0 -l100 git-update-cache --add`;
264 `git-ls-files --deleted -z | xargs --no-run-if-empty -0 -l100 git-update-cache --remove`;
265 }
266
267 if (@$add) {
268 while (@$add) {
269 my @slice = splice(@$add, 0, 100);
270 my $slice = join(' ', @slice);
271 `git-update-cache --add $slice`;
272 die "Error in git-update-cache --add: $!" if $?;
273 }
274 }
275 if (@$del) {
276 foreach my $file (@$del) {
277 unlink $file or die "Problems deleting $file : $!";
278 }
279 while (@$del) {
280 my @slice = splice(@$del, 0, 100);
281 my $slice = join(' ', @slice);
282 `git-update-cache --remove $slice`;
283 die "Error in git-update-cache --remove: $!" if $?;
284 }
285 }
286 if (@$ren) { # renamed
287 if (@$ren % 2) {
288 die "Odd number of entries in rename!?";
289 }
290 ;
291 while (@$ren) {
292 my $from = pop @$ren;
293 my $to = pop @$ren;
294
295 unless (-d dirname($to)) {
296 mkpath(dirname($to)); # will die on err
297 }
298 #print "moving $from $to";
299 `mv $from $to`;
300 die "Error renaming $from $to : $!" if $?;
301 `git-update-cache --remove $from`;
302 die "Error in git-update-cache --remove: $!" if $?;
303 `git-update-cache --add $to`;
304 die "Error in git-update-cache --add: $!" if $?;
305 }
306
307 }
308 if (@$mod) { # must be _after_ renames
309 while (@$mod) {
310 my @slice = splice(@$mod, 0, 100);
311 my $slice = join(' ', @slice);
312 `git-update-cache $slice`;
313 die "Error in git-update-cache: $!" if $?;
314 }
315 }
316
317 # warn "errors when running git-update-cache! $!";
318 $tree = `git-write-tree`;
319 die "cannot write tree $!" if $?;
320 chomp $tree;
321
322
323 #
324 # Who's your daddy?
325 #
326 my @par;
327 if ( -e ".git/refs/heads/$ps->{branch}") {
328 if (open HEAD, "<.git/refs/heads/$ps->{branch}") {
329 my $p = <HEAD>;
330 close HEAD;
331 chomp $p;
332 push @par, '-p', $p;
333 } else {
334 if ($ps->{type} eq 's') {
335 warn "Could not find the right head for the branch $ps->{branch}";
336 }
337 }
338 }
339
340 my $par = join (' ', @par);
341
342 #
343 # Commit, tag and clean state
344 #
345 $ENV{TZ} = 'GMT';
346 $ENV{GIT_AUTHOR_NAME} = $ps->{author};
347 $ENV{GIT_AUTHOR_EMAIL} = $ps->{email};
348 $ENV{GIT_AUTHOR_DATE} = $ps->{date};
349 $ENV{GIT_COMMITTER_NAME} = $ps->{author};
350 $ENV{GIT_COMMITTER_EMAIL} = $ps->{email};
351 $ENV{GIT_COMMITTER_DATE} = $ps->{date};
352
353 my ($pid, $commit_rh, $commit_wh);
354 $commit_rh = 'commit_rh';
355 $commit_wh = 'commit_wh';
356
357 $pid = open2(*READER, *WRITER, "git-commit-tree $tree $par")
358 or die $!;
359 print WRITER $logmessage; # write
360 close WRITER;
361 my $commitid = <READER>; # read
362 chomp $commitid;
363 close READER;
364 waitpid $pid,0; # close;
365
366 if (length $commitid != 40) {
367 die "Something went wrong with the commit! $! $commitid";
368 }
369 #
370 # Update the branch
371 #
372 open HEAD, ">.git/refs/heads/$ps->{branch}";
373 print HEAD $commitid;
374 close HEAD;
375 unlink ('.git/HEAD');
376 symlink("refs/heads/$ps->{branch}",".git/HEAD");
377
378 # tag accordingly
379 ptag($ps->{id}, $commitid); # private tag
380 if ($opt_T || $ps->{type} eq 't' || $ps->{type} eq 'i') {
381 tag($ps->{id}, $commitid);
382 }
383 print " * Committed $ps->{id}\n";
384 print " + tree $tree\n";
385 print " + commit $commitid\n";
386 # print " + commit date is $ps->{date} \n";
387}
388
389sub branchname {
390 my $id = shift;
391 $id =~ s#^.+?/##;
392 my @parts = split(m/--/, $id);
393 return join('--', @parts[0..1]);
394}
395
396sub apply_import {
397 my $ps = shift;
398 my $bname = branchname($ps->{id});
399
400 `mkdir -p $tmp`;
401
402 `tla get -s --no-pristine -A $ps->{repo} $ps->{id} $tmp/import`;
403 die "Cannot get import: $!" if $?;
404 `rsync -v --archive --delete --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/import/* ./`;
405 die "Cannot rsync import:$!" if $?;
406
407 `rm -fr $tmp/import`;
408 die "Cannot remove tempdir: $!" if $?;
409
410
411 return 1;
412}
413
414sub apply_cset {
415 my $ps = shift;
416
417 `mkdir -p $tmp`;
418
419 # get the changeset
420 `tla get-changeset -A $ps->{repo} $ps->{id} $tmp/changeset`;
421 die "Cannot get changeset: $!" if $?;
422
423 # apply patches
424 if (`find $tmp/changeset/patches -type f -name '*.patch'`) {
425 # this can be sped up considerably by doing
426 # (find | xargs cat) | patch
427 # but that cna get mucked up by patches
428 # with missing trailing newlines or the standard
429 # 'missing newline' flag in the patch - possibly
430 # produced with an old/buggy diff.
431 # slow and safe, we invoke patch once per patchfile
432 `find $tmp/changeset/patches -type f -name '*.patch' -print0 | grep -zv '{arch}' | xargs -iFILE -0 --no-run-if-empty patch -p1 --forward -iFILE`;
433 die "Problem applying patches! $!" if $?;
434 }
435
436 # apply changed binary files
437 if (my @modified = `find $tmp/changeset/patches -type f -name '*.modified'`) {
438 foreach my $mod (@modified) {
439 chomp $mod;
440 my $orig = $mod;
441 $orig =~ s/\.modified$//; # lazy
442 $orig =~ s!^\Q$tmp\E/changeset/patches/!!;
443 #print "rsync -p '$mod' '$orig'";
444 `rsync -p $mod ./$orig`;
445 die "Problem applying binary changes! $!" if $?;
446 }
447 }
448
449 # bring in new files
450 `rsync --archive --exclude '.git' --exclude '.arch-ids' --exclude '{arch}' $tmp/changeset/new-files-archive/* ./`;
451
452 # deleted files are hinted from the commitlog processing
453
454 `rm -fr $tmp/changeset`;
455}
456
457
458# =for reference
459# A log entry looks like
460# Revision: moodle-org--moodle--1.3.3--patch-15
461# Archive: arch-eduforge@catalyst.net.nz--2004
462# Creator: Penny Leach <penny@catalyst.net.nz>
463# Date: Wed May 25 14:15:34 NZST 2005
464# Standard-date: 2005-05-25 02:15:34 GMT
465# New-files: lang/de/.arch-ids/block_glossary_random.php.id
466# lang/de/.arch-ids/block_html.php.id
467# New-directories: lang/de/help/questionnaire
468# lang/de/help/questionnaire/.arch-ids
469# Renamed-files: .arch-ids/db_sears.sql.id db/.arch-ids/db_sears.sql.id
470# db_sears.sql db/db_sears.sql
471# Removed-files: lang/be/docs/.arch-ids/release.html.id
472# lang/be/docs/.arch-ids/releaseold.html.id
473# Modified-files: admin/cron.php admin/delete.php
474# admin/editor.html backup/lib.php backup/restore.php
475# New-patches: arch-eduforge@catalyst.net.nz--2004/moodle-org--moodle--1.3.3--patch-15
476# Summary: Updating to latest from MOODLE_14_STABLE (1.4.5+)
477# Keywords:
478#
479# Updating yadda tadda tadda madda
480sub parselog {
481 my $log = shift;
482 #print $log;
483
484 my (@add, @del, @mod, @ren, @kw, $sum, $msg );
485
486 if ($log =~ m/(?:\n|^)New-files:(.*?)(?=\n\w)/s ) {
487 my $files = $1;
488 @add = split(m/\s+/s, $files);
489 }
490
491 if ($log =~ m/(?:\n|^)Removed-files:(.*?)(?=\n\w)/s ) {
492 my $files = $1;
493 @del = split(m/\s+/s, $files);
494 }
495
496 if ($log =~ m/(?:\n|^)Modified-files:(.*?)(?=\n\w)/s ) {
497 my $files = $1;
498 @mod = split(m/\s+/s, $files);
499 }
500
501 if ($log =~ m/(?:\n|^)Renamed-files:(.*?)(?=\n\w)/s ) {
502 my $files = $1;
503 @ren = split(m/\s+/s, $files);
504 }
505
506 $sum ='';
507 if ($log =~ m/^Summary:(.+?)$/m ) {
508 $sum = $1;
509 $sum =~ s/^\s+//;
510 $sum =~ s/\s+$//;
511 }
512
513 $msg = '';
514 if ($log =~ m/\n\n(.+)$/s) {
515 $msg = $1;
516 $msg =~ s/^\s+//;
517 $msg =~ s/\s+$//;
518 }
519
520
521 # cleanup the arrays
522 foreach my $ref ( (\@add, \@del, \@mod, \@ren) ) {
523 my @tmp = ();
524 while (my $t = pop @$ref) {
525 next unless length ($t);
526 next if $t =~ m!\{arch\}/!;
527 next if $t =~ m!\.arch-ids/!;
528 next if $t =~ m!\.arch-inventory$!;
529 push (@tmp, shell_quote($t));
530 }
531 @$ref = @tmp;
532 }
533
534 #print Dumper [$sum, $msg, \@add, \@del, \@mod, \@ren];
535 return ($sum, $msg, \@add, \@del, \@mod, \@ren);
536}
537
538# write/read a tag
539sub tag {
540 my ($tag, $commit) = @_;
541 $tag =~ s|/|--|g;
542 $tag = shell_quote($tag);
543
544 if ($commit) {
545 open(C,">.git/refs/tags/$tag")
546 or die "Cannot create tag $tag: $!\n";
547 print C "$commit\n"
548 or die "Cannot write tag $tag: $!\n";
549 close(C)
550 or die "Cannot write tag $tag: $!\n";
551 print "Created tag '$tag' on '$commit'\n" if $opt_v;
552 } else { # read
553 open(C,"<.git/refs/tags/$tag")
554 or die "Cannot read tag $tag: $!\n";
555 $commit = <C>;
556 chomp $commit;
557 die "Error reading tag $tag: $!\n" unless length $commit == 40;
558 close(C)
559 or die "Cannot read tag $tag: $!\n";
560 return $commit;
561 }
562}
563
564# write/read a private tag
565# reads fail softly if the tag isn't there
566sub ptag {
567 my ($tag, $commit) = @_;
568 $tag =~ s|/|--|g;
569 $tag = shell_quote($tag);
570
571 unless (-d '.git/archimport/tags') {
572 mkpath('.git/archimport/tags');
573 }
574
575 if ($commit) { # write
576 open(C,">.git/archimport/tags/$tag")
577 or die "Cannot create tag $tag: $!\n";
578 print C "$commit\n"
579 or die "Cannot write tag $tag: $!\n";
580 close(C)
581 or die "Cannot write tag $tag: $!\n";
582 } else { # read
583 # if the tag isn't there, return 0
584 unless ( -s ".git/archimport/tags/$tag") {
d3968363
ML
585 return 0;
586 }
587 open(C,"<.git/archimport/tags/$tag")
588 or die "Cannot read tag $tag: $!\n";
589 $commit = <C>;
590 chomp $commit;
591 die "Error reading tag $tag: $!\n" unless length $commit == 40;
592 close(C)
593 or die "Cannot read tag $tag: $!\n";
594 return $commit;
595 }
596}