parallel: --halt when,why,num.

This commit is contained in:
Ole Tange 2015-04-27 22:56:26 +02:00
parent 96126e3c46
commit 8110572719
3 changed files with 148 additions and 82 deletions

View file

@ -161,7 +161,7 @@ for(keys %Global::sshmaster) {
kill "TERM", $_; kill "TERM", $_;
} }
::debug("init", "Halt\n"); ::debug("init", "Halt\n");
if($opt::halt) { if($opt::halt and $Global::halt_when ne "never") {
wait_and_exit($Global::halt_exitstatus); wait_and_exit($Global::halt_exitstatus);
} else { } else {
wait_and_exit(min(undef_as_zero($Global::exitstatus),254)); wait_and_exit(min(undef_as_zero($Global::exitstatus),254));
@ -920,8 +920,6 @@ sub parse_options {
} }
$opt::memfree = multiply_binary_prefix($opt::memfree); $opt::memfree = multiply_binary_prefix($opt::memfree);
if(defined $opt::controlmaster) { $opt::noctrlc = 1; } if(defined $opt::controlmaster) { $opt::noctrlc = 1; }
if(defined $opt::halt and
$opt::halt =~ /%/) { $opt::halt /= 100; }
if(defined $opt::timeout and $opt::timeout !~ /^\d+(\.\d+)?%?$/) { if(defined $opt::timeout and $opt::timeout !~ /^\d+(\.\d+)?%?$/) {
::error("--timeout must be seconds or percentage\n"); ::error("--timeout must be seconds or percentage\n");
wait_and_exit(255); wait_and_exit(255);
@ -1036,6 +1034,7 @@ sub parse_options {
} }
citation_notice(); citation_notice();
parse_halt();
parse_sshlogin(); parse_sshlogin();
parse_env_var(); parse_env_var();
@ -1108,6 +1107,53 @@ sub init_globals {
} }
} }
sub parse_halt {
    # Parse and validate --halt, splitting it into its components
    # Accepted flavours of $opt::halt:
    #   -2, -1, 0, 1, 2                  (old numeric form)
    #   N%                               (short for soon,fail=N%)
    #   never
    #   soon|now , fail|success = N|N%
    # On an invalid value an error is printed and
    # ::wait_and_exit(255) is called, so the rest of the program
    # never runs with a half-parsed --halt.
    # Uses:
    #   $opt::halt
    #   $Global::halt_when
    #   $Global::halt_fail
    #   $Global::halt_success
    #   $Global::halt_pct
    #   $Global::halt_count
    # Returns: N/A
    if(not defined $opt::halt) {
        return;
    }
    my %halt_expansion = (
        "0" => "never",
        "1" => "soon,fail=1",
        "2" => "now,fail=1",
        "-1" => "soon,success=1",
        "-2" => "now,success=1",
        );
    # Expand -2,-1,0,1,2 into long form
    $opt::halt = $halt_expansion{$opt::halt} || $opt::halt;
    # --halt 5% == --halt soon,fail=5%
    $opt::halt =~ s/^(\d+)%$/soon,fail=$1%/;
    # Split: soon,fail=5%
    my ($when,$fail_success,$pct_count) = split /[,=]/, $opt::halt;
    # Missing fields (e.g. '--halt now' or '--halt soon,fail') are
    # treated as empty strings, so they are rejected by the checks
    # below instead of being compared as undef
    if(not defined $when) { $when = ""; }
    if(not defined $fail_success) { $fail_success = ""; }
    if(not defined $pct_count) { $pct_count = ""; }
    if(not grep { $when eq $_ } qw(never soon now)) {
        ::error("--halt must have 'never', 'soon', or 'now'\n");
        ::wait_and_exit(255);
    }
    $Global::halt_when = $when;
    if($when eq "never") {
        # Nothing more to parse: 'never' takes no condition
        return;
    }
    if($fail_success eq "fail") {
        $Global::halt_fail = 1;
    } elsif($fail_success eq "success") {
        $Global::halt_success = 1;
    } else {
        ::error("--halt $when must be followed by ,success or ,fail\n");
        ::wait_and_exit(255);
    }
    if($pct_count =~ /^(\d+)%$/) {
        # Percentage is stored as a fraction: 5% => 0.05
        $Global::halt_pct = $1/100;
    } elsif($pct_count =~ /^(\d+)$/) {
        $Global::halt_count = $1;
    } else {
        ::error("--halt $when,$fail_success ",
                "must be followed by ,number or ,percent%\n");
        ::wait_and_exit(255);
    }
}
sub parse_replacement_string_options { sub parse_replacement_string_options {
# Deal with --rpl # Deal with --rpl
# Uses: # Uses:
@ -1738,6 +1784,7 @@ sub init_run_jobs {
# Returns: N/A # Returns: N/A
$Global::total_running = 0; $Global::total_running = 0;
$Global::total_started = 0; $Global::total_started = 0;
$Global::total_completed = 0;
$Global::tty_taken = 0; $Global::tty_taken = 0;
$SIG{USR1} = \&list_running_jobs; $SIG{USR1} = \&list_running_jobs;
$SIG{USR2} = \&toggle_progress; $SIG{USR2} = \&toggle_progress;
@ -2229,8 +2276,7 @@ sub progress {
# $avgtime = averaged time # $avgtime = averaged time
# $eta = smoothed eta # $eta = smoothed eta
$total ||= $Global::JobQueue->total_jobs(); $total ||= $Global::JobQueue->total_jobs();
my $completed = 0; my $completed = $Global::total_completed;
for(values %Global::host) { $completed += $_->jobs_completed() }
my $left = $total - $completed; my $left = $total - $completed;
if(not $completed) { if(not $completed) {
return($total, $completed, $left, 0, 0, 0); return($total, $completed, $left, 0, 0, 0);
@ -3005,18 +3051,15 @@ sub reaper {
# Update average runtime for timeout # Update average runtime for timeout
$Global::timeoutq->update_median_runtime($job->runtime()); $Global::timeoutq->update_median_runtime($job->runtime());
} }
# Force printing now if --halt forces us to exit if($opt::keeporder) {
my $print_now =
($opt::halt and
(($opt::halt == 2 and $job->exitstatus())
or
($opt::halt == -2 and not $job->exitstatus())));
if($opt::keeporder and not $print_now) {
$job->print_earlier_jobs(); $job->print_earlier_jobs();
} else { } else {
$job->print(); $job->print();
} }
$job->should_we_halt(); if($job->should_we_halt() eq "now") {
::killall();
::wait_and_exit($Global::exitstatus);
}
} }
my $sshlogin = $job->sshlogin(); my $sshlogin = $job->sshlogin();
$sshlogin->dec_jobs_running(); $sshlogin->dec_jobs_running();
@ -3974,6 +4017,7 @@ sub hostgroups {
sub inc_jobs_completed { sub inc_jobs_completed {
my $self = shift; my $self = shift;
$self->{'jobs_completed'}++; $self->{'jobs_completed'}++;
$Global::total_completed++;
} }
sub set_max_jobs_running { sub set_max_jobs_running {
@ -7433,56 +7477,57 @@ sub set_exitsignal {
# Returns: N/A # Returns: N/A
my $job = shift; my $job = shift;
if($job->exitstatus() or $job->exitsignal()) { if($job->exitstatus() or $job->exitsignal()) {
# Job failed
$Global::exitstatus++; $Global::exitstatus++;
$Global::total_failed++; $Global::total_failed++;
if($opt::halt) { if($Global::halt_fail) {
if($opt::halt == 1 ::status("$Global::progname: This job failed:\n",
$job->replaced(),"\n");
if(($Global::halt_count and
$Global::halt_count <= $Global::total_failed)
or or
($opt::halt > 0 and $opt::halt < 1 and $Global::total_failed > 3 ($Global::halt_pct and
and $Global::halt_pct <=
$Global::total_failed / $Global::total_started > $opt::halt)) { $Global::total_failed / $Global::total_started
# If halt on error == 1 or --halt 10% and $Global::total_failed > 3)) {
# we should gracefully exit # More than N jobs or more than N% failed
$Global::halt_exitstatus = $job->exitstatus();
if($Global::halt_when eq "soon") {
::status ::status
("$Global::progname: Starting no more jobs. ", ("$Global::progname: Starting no more jobs. ",
"Waiting for ", scalar(keys %Global::running), "Waiting for ", scalar(keys %Global::running),
" jobs to finish. This job failed:\n", " jobs to finish.\n");
$job->replaced(),"\n");
$Global::start_no_new_jobs ||= 1; $Global::start_no_new_jobs ||= 1;
$Global::halt_exitstatus = $job->exitstatus();
} elsif($opt::halt == 2) {
# If halt on error == 2 we should exit immediately
if(not $status_printed++) {
::status
("$Global::progname: This job failed:\n",
$job->replaced(),"\n");
} }
::killall(); return($Global::halt_when);
::wait_and_exit($job->exitstatus());
} }
} }
} else { } else {
if($opt::halt) { if($Global::halt_success) {
if($opt::halt == -1) { ::status("$Global::progname: This job succeeded:\n",
# If halt on error == -1 $job->replaced(),"\n");
# we should gracefully exit if(($Global::halt_count and
$Global::halt_count <=
1+$Global::total_completed-$Global::total_failed)
or
($Global::halt_pct and
$Global::halt_pct <=
(1+$Global::total_completed-$Global::total_failed)
/ $Global::total_completed
and ($Global::total_completed-$Global::total_failed) > 3)) {
$Global::halt_exitstatus = $job->exitstatus();
if($Global::halt_when eq "soon") {
::status ::status
("$Global::progname: Starting no more jobs. ", ("$Global::progname: Starting no more jobs. ",
"Waiting for ", scalar(keys %Global::running), "Waiting for ", scalar(keys %Global::running),
" jobs to finish. This job succeeded:\n", " jobs to finish.\n");
$job->replaced(),"\n");
$Global::start_no_new_jobs ||= 1; $Global::start_no_new_jobs ||= 1;
$Global::halt_exitstatus = $job->exitstatus(); }
} elsif($opt::halt == -2) { return($Global::halt_when);
# If halt on error == -2 we should exit immediately
::status
("$Global::progname: This job succeeded:\n",
$job->replaced(),"\n");
::killall();
::wait_and_exit($job->exitstatus());
} }
} }
} }
return "";
} }
} }

View file

@ -10,21 +10,22 @@ export TIMEOUT=$MAX_SEC_PER_TEST
run_test() { run_test() {
script="$1" script="$1"
base=`basename "$script" .sh` base=`basename "$script" .sh`
export TMPDIR=/tmp/$base export TMPDIR=/tmp/"$base"
mkdir -p "$TMPDIR"
if [ "$TRIES" = "3" ] ; then if [ "$TRIES" = "3" ] ; then
# Try 3 times # Try 3 times
bash $script > actual-results/$base bash "$script" > actual-results/"$base"
diff -Naur wanted-results/$base actual-results/$base >/dev/null || diff -Naur wanted-results/"$base" actual-results/"$base" >/dev/null ||
bash $script > actual-results/$base bash "$script" > actual-results/"$base"
diff -Naur wanted-results/$base actual-results/$base >/dev/null || diff -Naur wanted-results/"$base" actual-results/"$base" >/dev/null ||
bash $script > actual-results/$base bash "$script" > actual-results/"$base"
diff -Naur wanted-results/$base actual-results/$base || diff -Naur wanted-results/"$base" actual-results/"$base" ||
(touch $script && echo touch $script) (touch "$script" && echo touch "$script")
else else
# Run only once # Run only once
bash $script > actual-results/$base bash "$script" > actual-results/"$base"
diff -Naur wanted-results/$base actual-results/$base || diff -Naur wanted-results/"$base" actual-results/"$base" ||
(touch $script && echo touch $script) (touch "$script" && echo touch "$script")
fi fi
# Check if it was cleaned up # Check if it was cleaned up

View file

@ -17,13 +17,14 @@ echo '### Test --halt-on-error 1'; (echo "sleep 1;true"; echo "sleep 2;false";
### Test --halt-on-error 1 ### Test --halt-on-error 1
1 1
127 127
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job failed: parallel: This job failed:
sleep 2;false sleep 2;false
parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job failed: parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job failed:
sleep 2;false sleep 2;false
/bin/bash: non_exist: command not found /bin/bash: non_exist: command not found
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed: parallel: This job failed:
sleep 4; non_exist sleep 4; non_exist
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
echo '**' echo '**'
** **
echo '### Test --halt-on-error 2'; (echo "sleep 1;true"; echo "sleep 2;false";echo "sleep 3;true") | parallel -j10 --halt-on-error 2; echo $?; (echo "sleep 1;true"; echo "sleep 2;false";echo "sleep 3;true";echo "sleep 4; non_exist") | parallel -j10 --halt 2; echo $? echo '### Test --halt-on-error 2'; (echo "sleep 1;true"; echo "sleep 2;false";echo "sleep 3;true") | parallel -j10 --halt-on-error 2; echo $?; (echo "sleep 1;true"; echo "sleep 2;false";echo "sleep 3;true";echo "sleep 4; non_exist") | parallel -j10 --halt 2; echo $?
@ -40,10 +41,11 @@ echo '### Test --halt -1'; (echo "sleep 1;false"; echo "sleep 2;true";echo "sl
### Test --halt -1 ### Test --halt -1
0 0
0 0
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job succeeded: parallel: This job succeeded:
sleep 2;true sleep 2;true
parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job succeeded: parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job succeeded:
sleep 2;true sleep 2;true
parallel: Starting no more jobs. Waiting for 3 jobs to finish.
/bin/bash: non_exist: command not found /bin/bash: non_exist: command not found
echo '**' echo '**'
** **
@ -62,33 +64,42 @@ echo '### Test last dying print --halt-on-error 1'; (seq 0 8;echo 0; echo 9) |
exit code 9 exit code 9
0 0
1 1
parallel: Starting no more jobs. Waiting for 9 jobs to finish. This job failed: parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 1 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 1
parallel: Starting no more jobs. Waiting for 9 jobs to finish.
2 2
parallel: Starting no more jobs. Waiting for 8 jobs to finish. This job failed: parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 2 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 2
parallel: Starting no more jobs. Waiting for 8 jobs to finish.
3 3
parallel: Starting no more jobs. Waiting for 7 jobs to finish. This job failed: parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 3 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 3
parallel: Starting no more jobs. Waiting for 7 jobs to finish.
4 4
parallel: Starting no more jobs. Waiting for 6 jobs to finish. This job failed: parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 4 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 4
parallel: Starting no more jobs. Waiting for 6 jobs to finish.
5 5
parallel: Starting no more jobs. Waiting for 5 jobs to finish. This job failed: parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 5 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 5
parallel: Starting no more jobs. Waiting for 5 jobs to finish.
6 6
parallel: Starting no more jobs. Waiting for 4 jobs to finish. This job failed: parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 6 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 6
parallel: Starting no more jobs. Waiting for 4 jobs to finish.
7 7
parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job failed: parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 7 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 7
parallel: Starting no more jobs. Waiting for 3 jobs to finish.
8 8
0 0
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job failed: parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 8 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 8
parallel: Starting no more jobs. Waiting for 2 jobs to finish.
9 9
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed: parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 9 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 9
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
echo '### Test last dying print --halt-on-error 2'; (seq 0 8;echo 0; echo 9) | parallel -j10 -kq --halt 2 perl -e 'sleep $ARGV[0];print STDERR @ARGV,"\n"; exit shift'; echo exit code $? echo '### Test last dying print --halt-on-error 2'; (seq 0 8;echo 0; echo 9) | parallel -j10 -kq --halt 2 perl -e 'sleep $ARGV[0];print STDERR @ARGV,"\n"; exit shift'; echo exit code $?
### Test last dying print --halt-on-error 2 ### Test last dying print --halt-on-error 2
exit code 1 exit code 1
@ -101,33 +112,42 @@ echo '### Test last dying print --halt-on-error -1'; (seq 0 8;echo 0; echo 9)
exit code 0 exit code 0
0 0
1 1
parallel: Starting no more jobs. Waiting for 9 jobs to finish. This job succeeded: parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 1 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 1
parallel: Starting no more jobs. Waiting for 9 jobs to finish.
2 2
parallel: Starting no more jobs. Waiting for 8 jobs to finish. This job succeeded: parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 2 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 2
parallel: Starting no more jobs. Waiting for 8 jobs to finish.
3 3
parallel: Starting no more jobs. Waiting for 7 jobs to finish. This job succeeded: parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 3 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 3
parallel: Starting no more jobs. Waiting for 7 jobs to finish.
4 4
parallel: Starting no more jobs. Waiting for 6 jobs to finish. This job succeeded: parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 4 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 4
parallel: Starting no more jobs. Waiting for 6 jobs to finish.
5 5
parallel: Starting no more jobs. Waiting for 5 jobs to finish. This job succeeded: parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 5 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 5
parallel: Starting no more jobs. Waiting for 5 jobs to finish.
6 6
parallel: Starting no more jobs. Waiting for 4 jobs to finish. This job succeeded: parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 6 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 6
parallel: Starting no more jobs. Waiting for 4 jobs to finish.
7 7
parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job succeeded: parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 7 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 7
parallel: Starting no more jobs. Waiting for 3 jobs to finish.
8 8
0 0
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job succeeded: parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 8 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 8
parallel: Starting no more jobs. Waiting for 2 jobs to finish.
9 9
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job succeeded: parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 9 perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 9
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
echo '### Test last dying print --halt-on-error -2'; (seq 0 8;echo 0; echo 9) | parallel -j10 -kq --halt -2 perl -e 'sleep $ARGV[0];print STDERR @ARGV,"\n"; exit not shift'; echo exit code $? echo '### Test last dying print --halt-on-error -2'; (seq 0 8;echo 0; echo 9) | parallel -j10 -kq --halt -2 perl -e 'sleep $ARGV[0];print STDERR @ARGV,"\n"; exit not shift'; echo exit code $?
### Test last dying print --halt-on-error -2 ### Test last dying print --halt-on-error -2
exit code 0 exit code 0