parallel: --halt when,why,num.

This commit is contained in:
Ole Tange 2015-04-27 22:56:26 +02:00
parent 96126e3c46
commit 8110572719
3 changed files with 148 additions and 82 deletions

View file

@ -161,7 +161,7 @@ for(keys %Global::sshmaster) {
kill "TERM", $_;
}
::debug("init", "Halt\n");
if($opt::halt) {
if($opt::halt and $Global::halt_when ne "never") {
wait_and_exit($Global::halt_exitstatus);
} else {
wait_and_exit(min(undef_as_zero($Global::exitstatus),254));
@ -920,8 +920,6 @@ sub parse_options {
}
$opt::memfree = multiply_binary_prefix($opt::memfree);
if(defined $opt::controlmaster) { $opt::noctrlc = 1; }
if(defined $opt::halt and
$opt::halt =~ /%/) { $opt::halt /= 100; }
if(defined $opt::timeout and $opt::timeout !~ /^\d+(\.\d+)?%?$/) {
::error("--timeout must be seconds or percentage\n");
wait_and_exit(255);
@ -1036,6 +1034,7 @@ sub parse_options {
}
citation_notice();
parse_halt();
parse_sshlogin();
parse_env_var();
@ -1108,6 +1107,53 @@ sub init_globals {
}
}
sub parse_halt {
    # Parse the --halt option into its components.
    #
    # Accepted forms of $opt::halt:
    #   -2, -1, 0, 1, 2         legacy numeric flavours (expanded below)
    #   N%                      shorthand for soon,fail=N%
    #   never
    #   soon|now,fail|success=N   or   soon|now,fail|success=N%
    #
    # An invalid value is reported with ::error and the program then
    # exits with status 255 instead of continuing with a partially
    # initialised halt configuration.
    #
    # Uses:
    #   $opt::halt
    #   $Global::halt_when
    #   $Global::halt_fail
    #   $Global::halt_success
    #   $Global::halt_pct
    #   $Global::halt_count
    # Returns: N/A
    if(not defined $opt::halt) {
        return;
    }
    my %halt_expansion = (
        "0" => "never",
        "1" => "soon,fail=1",
        "2" => "now,fail=1",
        "-1" => "soon,success=1",
        "-2" => "now,success=1",
        );
    # Expand -2,-1,0,1,2 into long form
    $opt::halt = $halt_expansion{$opt::halt} || $opt::halt;
    # --halt 5% == --halt soon,fail=5%
    $opt::halt =~ s/^(\d+)%$/soon,fail=$1%/;
    # Split: soon,fail=5%
    my ($when,$fail_success,$pct_count) = split /[,=]/, $opt::halt;
    # Missing fields (e.g. plain "--halt now") are treated as empty so
    # they reach the error branches below without undef warnings.
    for my $field ($when, $fail_success, $pct_count) {
        defined $field or $field = "";
    }
    if(not grep { $when eq $_ } qw(never soon now)) {
        ::error("--halt must have 'never', 'soon', or 'now'\n");
        ::wait_and_exit(255);
    }
    $Global::halt_when = $when;
    if($when eq "never") {
        # Nothing more to parse: we never halt early
        return;
    }
    if($fail_success eq "fail") {
        $Global::halt_fail = 1;
    } elsif($fail_success eq "success") {
        $Global::halt_success = 1;
    } else {
        ::error("--halt $when must be followed by ,success or ,fail\n");
        ::wait_and_exit(255);
    }
    if($pct_count =~ /^(\d+)%$/) {
        # Percentage is stored as a fraction: 5% => 0.05
        $Global::halt_pct = $1/100;
    } elsif($pct_count =~ /^(\d+)$/) {
        $Global::halt_count = $1;
    } else {
        ::error("--halt $when,$fail_success ",
                "must be followed by ,number or ,percent%\n");
        ::wait_and_exit(255);
    }
}
sub parse_replacement_string_options {
# Deal with --rpl
# Uses:
@ -1738,6 +1784,7 @@ sub init_run_jobs {
# Returns: N/A
$Global::total_running = 0;
$Global::total_started = 0;
$Global::total_completed = 0;
$Global::tty_taken = 0;
$SIG{USR1} = \&list_running_jobs;
$SIG{USR2} = \&toggle_progress;
@ -2229,8 +2276,7 @@ sub progress {
# $avgtime = averaged time
# $eta = smoothed eta
$total ||= $Global::JobQueue->total_jobs();
my $completed = 0;
for(values %Global::host) { $completed += $_->jobs_completed() }
my $completed = $Global::total_completed;
my $left = $total - $completed;
if(not $completed) {
return($total, $completed, $left, 0, 0, 0);
@ -3005,18 +3051,15 @@ sub reaper {
# Update average runtime for timeout
$Global::timeoutq->update_median_runtime($job->runtime());
}
# Force printing now if --halt forces us to exit
my $print_now =
($opt::halt and
(($opt::halt == 2 and $job->exitstatus())
or
($opt::halt == -2 and not $job->exitstatus())));
if($opt::keeporder and not $print_now) {
if($opt::keeporder) {
$job->print_earlier_jobs();
} else {
$job->print();
$job->print();
}
if($job->should_we_halt() eq "now") {
::killall();
::wait_and_exit($Global::exitstatus);
}
$job->should_we_halt();
}
my $sshlogin = $job->sshlogin();
$sshlogin->dec_jobs_running();
@ -3974,6 +4017,7 @@ sub hostgroups {
sub inc_jobs_completed {
    # Record one more completed job, both on this object and in the
    # program-wide counter.
    # Uses:
    #   $Global::total_completed
    # Returns: value of $Global::total_completed before the increment
    my ($self) = @_;
    ++$self->{'jobs_completed'};
    return $Global::total_completed++;
}
sub set_max_jobs_running {
@ -7433,56 +7477,57 @@ sub set_exitsignal {
# Returns: N/A
my $job = shift;
if($job->exitstatus() or $job->exitsignal()) {
# Job failed
$Global::exitstatus++;
$Global::total_failed++;
if($opt::halt) {
if($opt::halt == 1
or
($opt::halt > 0 and $opt::halt < 1 and $Global::total_failed > 3
and
$Global::total_failed / $Global::total_started > $opt::halt)) {
# If halt on error == 1 or --halt 10%
# we should gracefully exit
::status
("$Global::progname: Starting no more jobs. ",
"Waiting for ", scalar(keys %Global::running),
" jobs to finish. This job failed:\n",
if($Global::halt_fail) {
::status("$Global::progname: This job failed:\n",
$job->replaced(),"\n");
$Global::start_no_new_jobs ||= 1;
if(($Global::halt_count and
$Global::halt_count <= $Global::total_failed)
or
($Global::halt_pct and
$Global::halt_pct <=
$Global::total_failed / $Global::total_started
and $Global::total_failed > 3)) {
# More than N jobs or more than N% failed
$Global::halt_exitstatus = $job->exitstatus();
} elsif($opt::halt == 2) {
# If halt on error == 2 we should exit immediately
if(not $status_printed++) {
if($Global::halt_when eq "soon") {
::status
("$Global::progname: This job failed:\n",
$job->replaced(),"\n");
("$Global::progname: Starting no more jobs. ",
"Waiting for ", scalar(keys %Global::running),
" jobs to finish.\n");
$Global::start_no_new_jobs ||= 1;
}
::killall();
::wait_and_exit($job->exitstatus());
return($Global::halt_when);
}
}
} else {
if($opt::halt) {
if($opt::halt == -1) {
# If halt on error == -1
# we should gracefully exit
::status
("$Global::progname: Starting no more jobs. ",
"Waiting for ", scalar(keys %Global::running),
" jobs to finish. This job succeeded:\n",
if($Global::halt_success) {
::status("$Global::progname: This job succeeded:\n",
$job->replaced(),"\n");
$Global::start_no_new_jobs ||= 1;
if(($Global::halt_count and
$Global::halt_count <=
1+$Global::total_completed-$Global::total_failed)
or
($Global::halt_pct and
$Global::halt_pct <=
(1+$Global::total_completed-$Global::total_failed)
/ $Global::total_completed
and ($Global::total_completed-$Global::total_failed) > 3)) {
$Global::halt_exitstatus = $job->exitstatus();
} elsif($opt::halt == -2) {
# If halt on error == -2 we should exit immediately
::status
("$Global::progname: This job succeeded:\n",
$job->replaced(),"\n");
::killall();
::wait_and_exit($job->exitstatus());
if($Global::halt_when eq "soon") {
::status
("$Global::progname: Starting no more jobs. ",
"Waiting for ", scalar(keys %Global::running),
" jobs to finish.\n");
$Global::start_no_new_jobs ||= 1;
}
return($Global::halt_when);
}
}
}
return "";
}
}

View file

@ -10,21 +10,22 @@ export TIMEOUT=$MAX_SEC_PER_TEST
run_test() {
script="$1"
base=`basename "$script" .sh`
export TMPDIR=/tmp/$base
export TMPDIR=/tmp/"$base"
mkdir -p "$TMPDIR"
if [ "$TRIES" = "3" ] ; then
# Try 3 times
bash $script > actual-results/$base
diff -Naur wanted-results/$base actual-results/$base >/dev/null ||
bash $script > actual-results/$base
diff -Naur wanted-results/$base actual-results/$base >/dev/null ||
bash $script > actual-results/$base
diff -Naur wanted-results/$base actual-results/$base ||
(touch $script && echo touch $script)
bash "$script" > actual-results/"$base"
diff -Naur wanted-results/"$base" actual-results/"$base" >/dev/null ||
bash "$script" > actual-results/"$base"
diff -Naur wanted-results/"$base" actual-results/"$base" >/dev/null ||
bash "$script" > actual-results/"$base"
diff -Naur wanted-results/"$base" actual-results/"$base" ||
(touch "$script" && echo touch "$script")
else
# Run only once
bash $script > actual-results/$base
diff -Naur wanted-results/$base actual-results/$base ||
(touch $script && echo touch $script)
bash "$script" > actual-results/"$base"
diff -Naur wanted-results/"$base" actual-results/"$base" ||
(touch "$script" && echo touch "$script")
fi
# Check if it was cleaned up

View file

@ -17,13 +17,14 @@ echo '### Test --halt-on-error 1'; (echo "sleep 1;true"; echo "sleep 2;false";
### Test --halt-on-error 1
1
127
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job failed:
parallel: This job failed:
sleep 2;false
parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job failed:
sleep 2;false
/bin/bash: non_exist: command not found
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed:
parallel: This job failed:
sleep 4; non_exist
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
echo '**'
**
echo '### Test --halt-on-error 2'; (echo "sleep 1;true"; echo "sleep 2;false";echo "sleep 3;true") | parallel -j10 --halt-on-error 2; echo $?; (echo "sleep 1;true"; echo "sleep 2;false";echo "sleep 3;true";echo "sleep 4; non_exist") | parallel -j10 --halt 2; echo $?
@ -40,10 +41,11 @@ echo '### Test --halt -1'; (echo "sleep 1;false"; echo "sleep 2;true";echo "sl
### Test --halt -1
0
0
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job succeeded:
parallel: This job succeeded:
sleep 2;true
parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job succeeded:
sleep 2;true
parallel: Starting no more jobs. Waiting for 3 jobs to finish.
/bin/bash: non_exist: command not found
echo '**'
**
@ -62,33 +64,42 @@ echo '### Test last dying print --halt-on-error 1'; (seq 0 8;echo 0; echo 9) |
exit code 9
0
1
parallel: Starting no more jobs. Waiting for 9 jobs to finish. This job failed:
parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 1
parallel: Starting no more jobs. Waiting for 9 jobs to finish.
2
parallel: Starting no more jobs. Waiting for 8 jobs to finish. This job failed:
parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 2
parallel: Starting no more jobs. Waiting for 8 jobs to finish.
3
parallel: Starting no more jobs. Waiting for 7 jobs to finish. This job failed:
parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 3
parallel: Starting no more jobs. Waiting for 7 jobs to finish.
4
parallel: Starting no more jobs. Waiting for 6 jobs to finish. This job failed:
parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 4
parallel: Starting no more jobs. Waiting for 6 jobs to finish.
5
parallel: Starting no more jobs. Waiting for 5 jobs to finish. This job failed:
parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 5
parallel: Starting no more jobs. Waiting for 5 jobs to finish.
6
parallel: Starting no more jobs. Waiting for 4 jobs to finish. This job failed:
parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 6
parallel: Starting no more jobs. Waiting for 4 jobs to finish.
7
parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job failed:
parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 7
parallel: Starting no more jobs. Waiting for 3 jobs to finish.
8
0
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job failed:
parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 8
parallel: Starting no more jobs. Waiting for 2 jobs to finish.
9
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed:
parallel: This job failed:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ shift 9
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
echo '### Test last dying print --halt-on-error 2'; (seq 0 8;echo 0; echo 9) | parallel -j10 -kq --halt 2 perl -e 'sleep $ARGV[0];print STDERR @ARGV,"\n"; exit shift'; echo exit code $?
### Test last dying print --halt-on-error 2
exit code 1
@ -101,33 +112,42 @@ echo '### Test last dying print --halt-on-error -1'; (seq 0 8;echo 0; echo 9)
exit code 0
0
1
parallel: Starting no more jobs. Waiting for 9 jobs to finish. This job succeeded:
parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 1
parallel: Starting no more jobs. Waiting for 9 jobs to finish.
2
parallel: Starting no more jobs. Waiting for 8 jobs to finish. This job succeeded:
parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 2
parallel: Starting no more jobs. Waiting for 8 jobs to finish.
3
parallel: Starting no more jobs. Waiting for 7 jobs to finish. This job succeeded:
parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 3
parallel: Starting no more jobs. Waiting for 7 jobs to finish.
4
parallel: Starting no more jobs. Waiting for 6 jobs to finish. This job succeeded:
parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 4
parallel: Starting no more jobs. Waiting for 6 jobs to finish.
5
parallel: Starting no more jobs. Waiting for 5 jobs to finish. This job succeeded:
parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 5
parallel: Starting no more jobs. Waiting for 5 jobs to finish.
6
parallel: Starting no more jobs. Waiting for 4 jobs to finish. This job succeeded:
parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 6
parallel: Starting no more jobs. Waiting for 4 jobs to finish.
7
parallel: Starting no more jobs. Waiting for 3 jobs to finish. This job succeeded:
parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 7
parallel: Starting no more jobs. Waiting for 3 jobs to finish.
8
0
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job succeeded:
parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 8
parallel: Starting no more jobs. Waiting for 2 jobs to finish.
9
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job succeeded:
parallel: This job succeeded:
perl -e sleep\ \$ARGV\[0\]\;print\ STDERR\ @ARGV,\"\\n\"\;\ exit\ not\ shift 9
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
echo '### Test last dying print --halt-on-error -2'; (seq 0 8;echo 0; echo 9) | parallel -j10 -kq --halt -2 perl -e 'sleep $ARGV[0];print STDERR @ARGV,"\n"; exit not shift'; echo exit code $?
### Test last dying print --halt-on-error -2
exit code 0