parallel: --retry-failed implemented.

This commit is contained in:
Ole Tange 2015-05-20 21:09:33 +02:00
parent 456f63d2bc
commit 87951b34d1
7 changed files with 277 additions and 58 deletions

View file

@ -252,6 +252,8 @@ taxator-tk http://algbio.cs.uni-duesseldorf.de/webapps/wa-download/ (check it)
* GNU Parallel was used in: Gene Set Omic Analysis (GSOA) method https://bitbucket.org/srp33/gsoa
* A Quick and Neat :) Orchestrator using GNU Parallel http://www.elsotanillo.net/2015/05/a-quick-and-neat-orchestrator-using-gnu-parallel/
* Execute commands on multiple computers using GNU Parallel (setting up a cluster on the cheap) https://spectraldifferences.wordpress.com/2015/04/26/execute-commands-on-multiple-computers-using-gnu-parallel-setting-up-a-cluster-on-the-cheap/
* Functions and GNU parallel for effective cluster load management http://genomespot.blogspot.dk/2015/04/functions-and-gnu-parallel-for.html

View file

@ -704,13 +704,14 @@ sub options_hash {
"results|result|res=s" => \$opt::results,
"resume" => \$opt::resume,
"resume-failed|resumefailed" => \$opt::resume_failed,
"retry-failed|retryfailed" => \$opt::retry_failed,
"silent" => \$opt::silent,
"keep-order|keeporder|k" => \$opt::keeporder,
"no-keep-order|nokeeporder|nok|no-k" => \$opt::nokeeporder,
"group" => \$opt::group,
"g" => \$opt::retired,
"ungroup|u" => \$opt::ungroup,
"linebuffer|linebuffered|line-buffer|line-buffered" => \$opt::linebuffer,
"linebuffer|linebuffered|line-buffer|line-buffered|lb" => \$opt::linebuffer,
"tmux" => \$opt::tmux,
"null|0" => \$opt::0,
"quote|q" => \$opt::q,
@ -1074,7 +1075,7 @@ sub parse_options {
sub init_globals {
# Defaults:
$Global::version = 20150515;
$Global::version = 20150516;
$Global::progname = 'parallel';
$Global::infinity = 2**31;
$Global::debug = 0;
@ -1340,27 +1341,59 @@ sub open_joblog {
::wait_and_exit(255);
}
if($opt::joblog) {
if($opt::resume || $opt::resume_failed) {
if($opt::resume || $opt::resume_failed || $opt::retry_failed) {
if(open(my $joblog_fh, "<", $opt::joblog)) {
# Read the joblog
$append = <$joblog_fh>; # If there is a header: Open as append later
my $joblog_regexp;
if($opt::resume_failed) {
if($opt::retry_failed) {
# Make a regexp that only matches commands with exit+signal=0
# 4 host 1360490623.067 3.445 1023 1222 0 0 command
$joblog_regexp='^(\d+)(?:\t[^\t]+){5}\t0\t0\t';
} else {
# Just match the job number
$joblog_regexp='^(\d+)';
my @group;
while(<$joblog_fh>) {
if(/$joblog_regexp/o) {
# This is 30% faster than set_job_already_run($1);
vec($Global::job_already_run,($1||0),1) = 1;
$group[$1-1] = "true";
} elsif(/(\d+)\s+\S+(\s+[-0-9.]+){6}\s+(.*)$/) {
$group[$1-1] = $3
} else {
chomp;
::error("Format of '$opt::joblog' is wrong: $_");
::wait_and_exit(255);
}
}
if(@group) {
my ($outfh,$name) = ::tmpfile(SUFFIX => ".arg");
unlink($name);
# Put args into argfile
print $outfh map { $_,$/ } @group;
seek $outfh, 0, 0;
exit_if_disk_full();
# Set filehandle to -a
@opt::a = ($outfh);
}
# Remove $command (so -a is run)
@ARGV = ();
}
while(<$joblog_fh>) {
if(/$joblog_regexp/o) {
# This is 30% faster than set_job_already_run($1);
vec($Global::job_already_run,($1||0),1) = 1;
} elsif(not /\d+\s+[^\s]+\s+([-0-9.]+\s+){6}/) {
chomp;
::error("Format of '$opt::joblog' is wrong: $_");
::wait_and_exit(255);
if($opt::resume || $opt::resume_failed) {
if($opt::resume_failed) {
# Make a regexp that only matches commands with exit+signal=0
# 4 host 1360490623.067 3.445 1023 1222 0 0 command
$joblog_regexp='^(\d+)(?:\t[^\t]+){5}\t0\t0\t';
} else {
# Just match the job number
$joblog_regexp='^(\d+)';
}
while(<$joblog_fh>) {
if(/$joblog_regexp/o) {
# This is 30% faster than set_job_already_run($1);
vec($Global::job_already_run,($1||0),1) = 1;
} elsif(not /\d+\s+[^\s]+\s+([-0-9.]+\s+){6}/) {
::error("Format of '$opt::joblog' is wrong: $_");
::wait_and_exit(255);
}
}
}
close $joblog_fh;
@ -3411,8 +3444,7 @@ sub tmpfile {
sub tmpname {
# Select a name that does not exist
# Do not create the file as that may cause problems
# if you ssh to localhost (or a shared file system) under a different name
# Do not create the file as it may be used for creating a socket (by tmux)
my $name = shift;
my($tmpname);
if(not -w $ENV{'TMPDIR'}) {
@ -6595,7 +6627,7 @@ sub sshlogin_wrap {
}
}
# Duplicate vars as BASH functions to include post-shellshock functions (v1+v2)
# So --env myfunc should also look for BASH_FUNC_myfunc()
# So --env myfunc should look for BASH_FUNC_myfunc() and BASH_FUNC_myfunc%%
push(@vars, "PARALLEL_PID", "PARALLEL_SEQ",
map { ("BASH_FUNC_$_()", "BASH_FUNC_$_%%") } @vars);
# Keep only defined variables

View file

@ -84,7 +84,21 @@ If it is a Bash function you need to B<export -f> the function
first. To use aliases copy the full environment as described under
B<--env> and use B<env_parallel> instead of B<parallel>.
If it is a Ksh function you can encode the function in a variable:
foo() {
echo $*;
}
export fun=`typeset -f foo`; parallel 'eval "$fun";' foo ::: works
To export all functions and make them available when running remote:
export fun=`typeset -f`; parallel --env fun 'eval "$fun";' foo ::: works
=cut
# ssh ksh@lo 'foo() { echo $* ; }; export fun="`typeset -f`"; parallel -S ksh@lo --env fun "eval \"\$fun\";"foo ::: works'
# ssh zsh@lo 'foo() { echo $* ; }; export fun="`typeset -f`"; parallel -S zsh@lo --env fun "eval \"\$fun\";"foo ::: works'
# If it is a zsh function you will need to use this helper function
# B<exportf> to export and to set $PARALLEL_SHELL to bash:
#
@ -970,6 +984,8 @@ Implies B<-X> unless B<-m>, B<--xargs>, or B<--pipe> is set.
=item B<--line-buffer>
=item B<--lb>
Buffer output on line basis. B<--group> will keep the output together
for a whole job. B<--ungroup> allows output to mix up with half a line
coming from one job and half a line coming from another.
@ -1118,6 +1134,9 @@ defaults to '\n'. To have no record separator use B<--recend "">.
B<--files> is often used with B<--pipe>.
B<--pipe> maxes out at around 1 GB/s input, and 100 MB/s output. If
performance is important use B<--pipepart>.
See also: B<--recstart>, B<--recend>, B<--fifo>, B<--cat>, B<--pipepart>.
@ -1130,7 +1149,8 @@ B<--pipe>, but is much faster. It has a few limitations:
=item Z<>*
The file must be a physical (seekable) file and must be given using B<-a> or B<::::>.
The file must be a physical (seekable) file (not a stream) and must be
given using B<-a> or B<::::>.
=item Z<>*
@ -1437,6 +1457,17 @@ commands.
See also B<--joblog>, B<--resume>.
=item B<--retry-failed> (alpha testing)
Retry all failed jobs in joblog. By reading B<--joblog> GNU
B<parallel> will figure out the failed jobs and run those again.
B<--retry-failed> ignores the command and arguments: It only looks at
the joblog.
See also B<--joblog>, B<--resume>, B<--resume-failed>.
=item B<--retries> I<n>
If a job fails, retry it on another computer on which it has not

View file

@ -141,19 +141,29 @@ command.
{unlink;rmdir;} if($bash=~s/h//) {exit$bash;} exit$csh;' "$?h"
"$status" {});
{} is really just a tmpfile. The Perl script saves the exit value,
unlinks the tmpfile, and returns the exit value - no matter if the
shell is B<bash> (using $?) or B<*csh> (using $status).
{} is set to $PARALLEL_TMP which is a tmpfile. The Perl script saves
the exit value, unlinks the tmpfile, and returns the exit value - no
matter if the shell is B<bash> (using $?) or B<*csh> (using $status).
=item --fifo
(mkfifo {};
(<<input>> {};) & _PID=$!; cat > {}; wait $_PID; perl -e '$bash=shift; $csh=shift; for(@ARGV)
{unlink;rmdir;} if($bash=~s/h//) {exit$bash;} exit$csh;' "$?h"
"$status" {});
perl -e '($s,$c,$f) = @ARGV;
system "mkfifo", $f;
$pid = fork || exec $s, "-c", $c;
open($o,">",$f) || die $!;
while(sysread(STDIN,$buf,32768)){
syswrite $o, $buf;
}
close $o;
waitpid $pid,0;
unlink $f;
exit $?/256;' $shell <<input>> $PARALLEL_TMP
B<wait $_PID> makes sure the exit value is from that PID. This makes it
incompatible with B<*csh>. The Perl script is the same as from B<--cat>.
This is an elaborate way of: mkfifo {}; run <<input>> in the
background using $shell; copying STDIN to {}; waiting for background
to complete; remove {} and exit with the exit code from <<input>>.
It is made this way to be compatible with B<*csh>.
=item --sshlogin I<sln>
@ -249,8 +259,6 @@ For B<tmux 1.8> 17000 can be lowered to 2100.
The interesting areas are title 0..1000 with (title + whole command)
in 996..1127 and 9331..9636.
=back
The ordering of the wrapping is important:

View file

@ -55,6 +55,7 @@
<li><a href="#Control-the-execution">Control the execution</a>
<ul>
<li><a href="#Number-of-simultaneous-jobs">Number of simultaneous jobs</a></li>
<li><a href="#Shuffle-job-order">Shuffle job order</a></li>
<li><a href="#Interactivity">Interactivity</a></li>
<li><a href="#A-terminal-for-every-job">A terminal for every job</a></li>
<li><a href="#Timing">Timing</a></li>
@ -93,6 +94,7 @@
<li><a href="#Semaphore">Semaphore</a>
<ul>
<li><a href="#Counting-semaphore">Counting semaphore</a></li>
<li><a href="#Timeout">Timeout</a></li>
</ul>
</li>
<li><a href="#Informational">Informational</a></li>
@ -1068,6 +1070,16 @@
<pre><code> parallel --use-cpus-instead-of-cores -N0 sleep 1 :::: num8</code></pre>
<h2 id="Shuffle-job-order">Shuffle job order</h2>
<p>If you have many jobs (e.g. by multiple combinations of input sources), it can be handy to shuffle the jobs, so you get different values run.</p>
<pre><code> parallel --shuf echo ::: 1 2 3 ::: a b c ::: A B C</code></pre>
<p>Output:</p>
<pre><code> All combinations but different order for each run.</code></pre>
<h2 id="Interactivity">Interactivity</h2>
<p>GNU Parallel can ask the user if a command should be run using --interactive:</p>
@ -1098,7 +1110,7 @@
<p>This will tell you to run something similar to:</p>
<pre><code> tmux -S /tmp/paroRLCx.tms attach</code></pre>
<pre><code> tmux -S /tmp/tmsrPrO0 attach</code></pre>
<p>Using normal tmux keystrokes (CTRL-b n or CTRL-b p) you can cycle between windows of the running jobs. When a job is finished it will pause for 10 seconds before closing the window.</p>
@ -1233,9 +1245,9 @@
<h2 id="Termination">Termination</h2>
<p>For certain jobs there is no need to continue if one of the jobs fails and has an exit code != 0. GNU Parallel will stop spawning new jobs with --halt 1:</p>
<p>For certain jobs there is no need to continue if one of the jobs fails and has an exit code != 0. GNU Parallel will stop spawning new jobs with --halt soon,fail=1:</p>
<pre><code> parallel -j2 --halt 1 echo {}\; exit {} ::: 0 0 1 2 3</code></pre>
<pre><code> parallel -j2 --halt soon,fail=1 echo {}\; exit {} ::: 0 0 1 2 3</code></pre>
<p>Output:</p>
@ -1248,9 +1260,9 @@
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed:
echo 2; exit 2</code></pre>
<p>With --halt 2 the running jobs will be killed immediately:</p>
<p>With --halt now,fail=1 the running jobs will be killed immediately:</p>
<pre><code> parallel -j2 --halt 2 echo {}\; exit {} ::: 0 0 1 2 3</code></pre>
<pre><code> parallel -j2 --halt now,fail=1 echo {}\; exit {} ::: 0 0 1 2 3</code></pre>
<p>Output:</p>
@ -1260,23 +1272,38 @@
parallel: This job failed:
echo 1; exit 1</code></pre>
<p>If --halt is given a percentage this percentage of the jobs must fail (though minimum 3) before GNU Parallel stops spawning more jobs:</p>
<p>If --halt is given a percentage this percentage of the jobs must fail before GNU Parallel stops spawning more jobs:</p>
<pre><code> parallel -j2 --halt 20% echo {}\; exit {} ::: 0 0 1 2 3 4 5 6 7</code></pre>
<pre><code> parallel -j2 --halt soon,fail=20% echo {}\; exit {} ::: 0 1 2 3 4 5 6 7 8 9</code></pre>
<p>Output:</p>
<pre><code> 0
0
1
parallel: This job failed:
echo 1; exit 1
2
parallel: This job failed:
echo 2; exit 2
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
3
parallel: This job failed:
echo 3; exit 3</code></pre>
<p>If you are looking for success instead of failures, you can use success:</p>
<pre><code> parallel -j2 --halt soon,success=1 echo {}\; exit {} ::: 1 2 3 0 4 5 6</code></pre>
<p>Output:</p>
<pre><code> 1
2
3
4
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job failed:
echo 4; exit 4
5
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed:
echo 5; exit 5</code></pre>
0
parallel: This job succeeded:
echo 0; exit 0
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
4</code></pre>
<p>GNU Parallel can retry the command with --retries. This is useful if a command fails for unknown reasons now and then.</p>
@ -1382,6 +1409,12 @@
<p>Output: Same as above.</p>
<p>Or newline:</p>
<pre><code> # This gives a \n between $SERVER1 and $SERVER2
SERVERS=&quot;`echo $SERVER1; echo $SERVER2`&quot;
parallel -S &quot;$SERVERS&quot; echo ::: running on more hosts</code></pre>
<p>They can also be read from a file (replace user@ with the user on $SERVER2):</p>
<pre><code> echo $SERVER1 &gt; nodefile
@ -1401,6 +1434,24 @@
<pre><code> force 4 cpus on server</code></pre>
<p>Servers can be put into groups by prepending &#39;@groupname&#39; to the server and the group can then be selected by appending &#39;@groupname&#39; to the argument if using &#39;--hostgrp&#39;.</p>
<pre><code> parallel --hostgrp -S @grp1/$SERVER1 -S @grp2/SERVER2 echo {} ::: run_on_grp1@grp1 run_on_grp2@grp2</code></pre>
<p>Output:</p>
<pre><code> run_on_grp1
run_on_grp2</code></pre>
<p>A host can be in multiple groups by separating groups with &#39;+&#39;, and you can force GNU <b>parallel</b> to limit the groups on which the command can be run with &#39;-S @groupname&#39;:</p>
<pre><code> parallel -S @grp1 -S @grp1+grp2/$SERVER1 -S @grp2/SERVER2 echo {} ::: run_on_grp1 also_grp1</code></pre>
<p>Output:</p>
<pre><code> run_on_grp1
also_grp1</code></pre>
<h2 id="Transferring-files">Transferring files</h2>
<p>GNU Parallel can transfer the files to be processed to the remote host. It does that using rsync.</p>
@ -2039,6 +2090,25 @@
The third finished
The fourth finished</code></pre>
<h2 id="Timeout">Timeout</h2>
<p>With --semaphoretimeout you can force running the command anyway after a period (positive number) or give up (negative number):</p>
<pre><code> sem --id foo -u &#39;echo Slow started; sleep 5; echo Slow ended&#39; &amp;&amp;
sem --id foo --semaphoretimeout 1 &#39;echo Force this running after 1 sec&#39; &amp;&amp;
sem --id foo --semaphoretimeout -2 &#39;echo Give up after 1 sec&#39;
sem --id foo --wait</code></pre>
<p>Output:</p>
<pre><code> Slow started
parallel: Warning: Semaphore timed out. Stealing the semaphore.
Force this running after 1 sec
Slow ended
parallel: Warning: Semaphore timed out. Exiting.</code></pre>
<p>Note how the &#39;Give up&#39; was not run.</p>
<h1 id="Informational">Informational</h1>
<p>GNU Parallel has some options to give short information about the configuration.</p>

View file

@ -996,6 +996,18 @@ GNU Parallel can base it on the number of CPUs:
parallel --use-cpus-instead-of-cores -N0 sleep 1 :::: num8
=head2 Shuffle job order
If you have many jobs (e.g. by multiple combinations of input
sources), it can be handy to shuffle the jobs, so you get different
values run.
parallel --shuf echo ::: 1 2 3 ::: a b c ::: A B C
Output:
All combinations but different order for each run.
=head2 Interactivity
GNU Parallel can ask the user if a command should be run using --interactive:
@ -1027,7 +1039,7 @@ Using tmux GNU Parallel can start a terminal for every job run:
This will tell you to run something similar to:
tmux -S /tmp/paroRLCx.tms attach
tmux -S /tmp/tmsrPrO0 attach
Using normal tmux keystrokes (CTRL-b n or CTRL-b p) you can cycle
between windows of the running jobs. When a job is finished it will
@ -1178,9 +1190,9 @@ Note how seq 1 2 3 have been repeated because they had exit value != 0.
For certain jobs there is no need to continue if one of the jobs fails
and has an exit code != 0. GNU Parallel will stop spawning new jobs
with --halt 1:
with --halt soon,fail=1:
parallel -j2 --halt 1 echo {}\; exit {} ::: 0 0 1 2 3
parallel -j2 --halt soon,fail=1 echo {}\; exit {} ::: 0 0 1 2 3
Output:
@ -1193,9 +1205,9 @@ Output:
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed:
echo 2; exit 2
With --halt 2 the running jobs will be killed immediately:
With --halt now,fail=1 the running jobs will be killed immediately:
parallel -j2 --halt 2 echo {}\; exit {} ::: 0 0 1 2 3
parallel -j2 --halt now,fail=1 echo {}\; exit {} ::: 0 0 1 2 3
Output:
@ -1206,23 +1218,38 @@ Output:
echo 1; exit 1
If --halt is given a percentage this percentage of the jobs must fail
(though minimum 3) before GNU Parallel stops spawning more jobs:
before GNU Parallel stops spawning more jobs:
parallel -j2 --halt 20% echo {}\; exit {} ::: 0 0 1 2 3 4 5 6 7
parallel -j2 --halt soon,fail=20% echo {}\; exit {} ::: 0 1 2 3 4 5 6 7 8 9
Output:
0
1
parallel: This job failed:
echo 1; exit 1
2
parallel: This job failed:
echo 2; exit 2
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
3
parallel: This job failed:
echo 3; exit 3
0
0
If you are looking for success instead of failures, you can use success:
parallel -j2 --halt soon,success=1 echo {}\; exit {} ::: 1 2 3 0 4 5 6
Output:
1
2
3
0
parallel: This job succeeded:
echo 0; exit 0
parallel: Starting no more jobs. Waiting for 1 jobs to finish.
4
parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job failed:
echo 4; exit 4
5
parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed:
echo 5; exit 5
GNU Parallel can retry the command with --retries. This is useful if a
command fails for unknown reasons now and then.
@ -1332,6 +1359,12 @@ Or they can be separated by ,:
Output: Same as above.
Or newline:
# This gives a \n between $SERVER1 and $SERVER2
SERVERS="`echo $SERVER1; echo $SERVER2`"
parallel -S "$SERVERS" echo ::: running on more hosts
They can also be read from a file (replace user@ with the user on $SERVER2):
echo $SERVER1 > nodefile
@ -1352,6 +1385,28 @@ Output:
force 4 cpus on server
Servers can be put into groups by prepending '@groupname' to the
server and the group can then be selected by appending '@groupname' to
the argument if using '--hostgrp'.
parallel --hostgrp -S @grp1/$SERVER1 -S @grp2/SERVER2 echo {} ::: run_on_grp1@grp1 run_on_grp2@grp2
Output:
run_on_grp1
run_on_grp2
A host can be in multiple groups by separating groups with '+', and
you can force GNU B<parallel> to limit the groups on which the command
can be run with '-S @groupname':
parallel -S @grp1 -S @grp1+grp2/$SERVER1 -S @grp2/SERVER2 echo {} ::: run_on_grp1 also_grp1
Output:
run_on_grp1
also_grp1
=head2 Transferring files
GNU Parallel can transfer the files to be processed to the remote
@ -2052,6 +2107,25 @@ Output:
The third finished
The fourth finished
=head2 Timeout
With --semaphoretimeout you can force running the command anyway after
a period (positive number) or give up (negative number):
sem --id foo -u 'echo Slow started; sleep 5; echo Slow ended' &&
sem --id foo --semaphoretimeout 1 'echo Force this running after 1 sec' &&
sem --id foo --semaphoretimeout -2 'echo Give up after 1 sec'
sem --id foo --wait
Output:
Slow started
parallel: Warning: Semaphore timed out. Stealing the semaphore.
Force this running after 1 sec
Slow ended
parallel: Warning: Semaphore timed out. Exiting.
Note how the 'Give up' was not run.
=head1 Informational

View file

@ -35,6 +35,8 @@ perl -ne '$/="\n\n"; /^Output/../^[^O]\S/ and next; /^ / and print;' ../../src/
s/\d{10}.\d{3}\s+..\d+/TIMESTAMP\t9.999/g;
# Version
s/201\d{5}/VERSION/g;
# [123] [abc] [ABC]
s/^[123] [abc] [ABC]$/123 abc ABC/g;
# Remote script
s/(PARALLEL_PID\D+)\d+/${1}000000/g;
# /usr/bin/time -f %e