From 87951b34d1329f3e512dba6b7ec238abfb21932e Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Wed, 20 May 2015 21:09:33 +0200 Subject: [PATCH] parallel: --retry-failed implemented. --- doc/release_new_version | 2 + src/parallel | 68 +++++++++---- src/parallel.pod | 33 ++++++- src/parallel_design.pod | 30 +++--- src/parallel_tutorial.html | 98 ++++++++++++++++--- src/parallel_tutorial.pod | 102 +++++++++++++++++--- testsuite/tests-to-run/parallel-tutorial.sh | 2 + 7 files changed, 277 insertions(+), 58 deletions(-) diff --git a/doc/release_new_version b/doc/release_new_version index 8b1c31cf..ec9a9b8d 100644 --- a/doc/release_new_version +++ b/doc/release_new_version @@ -252,6 +252,8 @@ taxator-tk http://algbio.cs.uni-duesseldorf.de/webapps/wa-download/ (check it) * GNU Parallel was used in: Gene Set Omic Analysis (GSOA) method https://bitbucket.org/srp33/gsoa +* A Quick and Neat :) Orchestrator using GNU Parallel http://www.elsotanillo.net/2015/05/a-quick-and-neat-orchestrator-using-gnu-parallel/ + * Execute commands on multiple computers using GNU Parallel (setting up a cluster on the cheap) https://spectraldifferences.wordpress.com/2015/04/26/execute-commands-on-multiple-computers-using-gnu-parallel-setting-up-a-cluster-on-the-cheap/ * Functions and GNU parallel for effective cluster load management http://genomespot.blogspot.dk/2015/04/functions-and-gnu-parallel-for.html diff --git a/src/parallel b/src/parallel index 1dc9a6f8..d94b45c6 100755 --- a/src/parallel +++ b/src/parallel @@ -704,13 +704,14 @@ sub options_hash { "results|result|res=s" => \$opt::results, "resume" => \$opt::resume, "resume-failed|resumefailed" => \$opt::resume_failed, + "retry-failed|retryfailed" => \$opt::retry_failed, "silent" => \$opt::silent, "keep-order|keeporder|k" => \$opt::keeporder, "no-keep-order|nokeeporder|nok|no-k" => \$opt::nokeeporder, "group" => \$opt::group, "g" => \$opt::retired, "ungroup|u" => \$opt::ungroup, - "linebuffer|linebuffered|line-buffer|line-buffered" => 
\$opt::linebuffer, + "linebuffer|linebuffered|line-buffer|line-buffered|lb" => \$opt::linebuffer, "tmux" => \$opt::tmux, "null|0" => \$opt::0, "quote|q" => \$opt::q, @@ -1074,7 +1075,7 @@ sub parse_options { sub init_globals { # Defaults: - $Global::version = 20150515; + $Global::version = 20150516; $Global::progname = 'parallel'; $Global::infinity = 2**31; $Global::debug = 0; @@ -1340,27 +1341,59 @@ sub open_joblog { ::wait_and_exit(255); } if($opt::joblog) { - if($opt::resume || $opt::resume_failed) { + if($opt::resume || $opt::resume_failed || $opt::retry_failed) { if(open(my $joblog_fh, "<", $opt::joblog)) { # Read the joblog $append = <$joblog_fh>; # If there is a header: Open as append later my $joblog_regexp; - if($opt::resume_failed) { + if($opt::retry_failed) { # Make a regexp that only matches commands with exit+signal=0 # 4 host 1360490623.067 3.445 1023 1222 0 0 command $joblog_regexp='^(\d+)(?:\t[^\t]+){5}\t0\t0\t'; - } else { - # Just match the job number - $joblog_regexp='^(\d+)'; + my @group; + while(<$joblog_fh>) { + if(/$joblog_regexp/o) { + # This is 30% faster than set_job_already_run($1); + vec($Global::job_already_run,($1||0),1) = 1; + $group[$1-1] = "true"; + } elsif(/(\d+)\s+\S+(\s+[-0-9.]+){6}\s+(.*)$/) { + $group[$1-1] = $3 + } else { + chomp; + ::error("Format of '$opt::joblog' is wrong: $_"); + ::wait_and_exit(255); + } + } + if(@group) { + my ($outfh,$name) = ::tmpfile(SUFFIX => ".arg"); + unlink($name); + # Put args into argfile + print $outfh map { $_,$/ } @group; + seek $outfh, 0, 0; + exit_if_disk_full(); + # Set filehandle to -a + @opt::a = ($outfh); + } + # Remove $command (so -a is run) + @ARGV = (); } - while(<$joblog_fh>) { - if(/$joblog_regexp/o) { - # This is 30% faster than set_job_already_run($1); - vec($Global::job_already_run,($1||0),1) = 1; - } elsif(not /\d+\s+[^\s]+\s+([-0-9.]+\s+){6}/) { - chomp; - ::error("Format of '$opt::joblog' is wrong: $_"); - ::wait_and_exit(255); + if($opt::resume || $opt::resume_failed) { + 
if($opt::resume_failed) { + # Make a regexp that only matches commands with exit+signal=0 + # 4 host 1360490623.067 3.445 1023 1222 0 0 command + $joblog_regexp='^(\d+)(?:\t[^\t]+){5}\t0\t0\t'; + } else { + # Just match the job number + $joblog_regexp='^(\d+)'; + } + while(<$joblog_fh>) { + if(/$joblog_regexp/o) { + # This is 30% faster than set_job_already_run($1); + vec($Global::job_already_run,($1||0),1) = 1; + } elsif(not /\d+\s+[^\s]+\s+([-0-9.]+\s+){6}/) { + ::error("Format of '$opt::joblog' is wrong: $_"); + ::wait_and_exit(255); + } } } close $joblog_fh; @@ -3411,8 +3444,7 @@ sub tmpfile { sub tmpname { # Select a name that does not exist - # Do not create the file as that may cause problems - # if you ssh to localhost (or a shared file system) under a different name + # Do not create the file as it may be used for creating a socket (by tmux) my $name = shift; my($tmpname); if(not -w $ENV{'TMPDIR'}) { @@ -6595,7 +6627,7 @@ sub sshlogin_wrap { } } # Duplicate vars as BASH functions to include post-shellshock functions (v1+v2) - # So --env myfunc should also look for BASH_FUNC_myfunc() + # So --env myfunc should look for BASH_FUNC_myfunc() and BASH_FUNC_myfunc%% push(@vars, "PARALLEL_PID", "PARALLEL_SEQ", map { ("BASH_FUNC_$_()", "BASH_FUNC_$_%%") } @vars); # Keep only defined variables diff --git a/src/parallel.pod b/src/parallel.pod index 52e89b23..9c1bc121 100644 --- a/src/parallel.pod +++ b/src/parallel.pod @@ -84,7 +84,21 @@ If it is a Bash function you need to B the function first. To use aliases copy the full environment as described under B<--env> and use B instead of B. 
+If it is a Ksh function you can encode the function in a variable: + + foo() { + echo $*; + } + export fun=`typeset -f foo`; parallel 'eval "$fun";' foo ::: works + +To export all functions and make them available when running remote: + + export fun=`typeset -f`; parallel --env fun 'eval "$fun";' foo ::: works + =cut +# ssh ksh@lo 'foo() { echo $* ; }; export fun="`typeset -f`"; parallel -S ksh@lo --env fun "eval \"\$fun\";"foo ::: works' +# ssh zsh@lo 'foo() { echo $* ; }; export fun="`typeset -f`"; parallel -S zsh@lo --env fun "eval \"\$fun\";"foo ::: works' + # If it is a zsh function you will need to use this helper function # B to export and to set $PARALLEL_SHELL to bash: # @@ -970,6 +984,8 @@ Implies B<-X> unless B<-m>, B<--xargs>, or B<--pipe> is set. =item B<--line-buffer> +=item B<--lb> + Buffer output on line basis. B<--group> will keep the output together for a whole job. B<--ungroup> allows output to mixup with half a line coming from one job and half a line coming from another @@ -1118,6 +1134,9 @@ defaults to '\n'. To have no record separator use B<--recend "">. B<--files> is often used with B<--pipe>. +B<--pipe> maxes out at around 1 GB/s input, and 100 MB/s output. If +performance is important use B<--pipepart>. + See also: B<--recstart>, B<--recend>, B<--fifo>, B<--cat>, B<--pipepart>. @@ -1130,7 +1149,8 @@ B<--pipe>, but is much faster. It has a few limitations: =item Z<>* -The file must be a physical (seekable) file and must be given using B<-a> or B<::::>. +The file must be a physical (seekable) file (not a stream) and must be +given using B<-a> or B<::::>. =item Z<>* @@ -1437,6 +1457,17 @@ commands. See also B<--joblog>, B<--resume>. +=item B<--retry-failed> (alpha testing) + +Retry all failed jobs in joblog. By reading B<--joblog> GNU +B will figure out the failed jobs and run those again. + +B<--retry-failed> ignore the command and arguments: It only looks at +the joblog. + +See also B<--joblog>, B<--resume>, B<--resume-failed>. 
+ + =item B<--retries> I If a job fails, retry it on another computer on which it has not diff --git a/src/parallel_design.pod b/src/parallel_design.pod index 5b19c4be..9e48e018 100644 --- a/src/parallel_design.pod +++ b/src/parallel_design.pod @@ -141,19 +141,29 @@ command. {unlink;rmdir;} if($bash=~s/h//) {exit$bash;} exit$csh;' "$?h" "$status" {}); -{} is really just a tmpfile. The Perl script saves the exit value, -unlinks the tmpfile, and returns the exit value - no matter if the -shell is B (using $?) or B<*csh> (using $status). +{} is set to $PARALLEL_TMP which is a tmpfile. The Perl script saves +the exit value, unlinks the tmpfile, and returns the exit value - no +matter if the shell is B (using $?) or B<*csh> (using $status). =item --fifo -(mkfifo {}; - (<> {};) & _PID=$!; cat > {}; wait $_PID; perl -e '$bash=shift; $csh=shift; for(@ARGV) -{unlink;rmdir;} if($bash=~s/h//) {exit$bash;} exit$csh;' "$?h" -"$status" {}); +perl -e '($s,$c,$f) = @ARGV; +system "mkfifo", $f; +$pid = fork || exec $s, "-c", $c; +open($o,">",$f) || die $!; +while(sysread(STDIN,$buf,32768)){ +syswrite $o, $buf; +} +close $o; +waitpid $pid,0; +unlink $f; +exit $?/256;' $shell <> $PARALLEL_TMP -B makes sure the exit value is from that PID. This makes it -incompatible with B<*csh>. The Perl script is the same as from B<--cat>. +This is an elaborate way of: mkfifo {}; run <> in the +background using $shell; copying STDIN to {}; waiting for background +to complete; remove {} and exit with the exit code from <>. + +It is made this way to be compatible with B<*csh>. =item --sshlogin I @@ -249,8 +259,6 @@ For B 17000 can be lowered to 2100. The interesting areas are title 0..1000 with (title + whole command) in 996..1127 and 9331..9636. - - =back The ordering of the wrapping is important: diff --git a/src/parallel_tutorial.html b/src/parallel_tutorial.html index 6e357bb7..d506baa2 100644 --- a/src/parallel_tutorial.html +++ b/src/parallel_tutorial.html @@ -55,6 +55,7 @@
  • Control the execution
    • Number of simultaneous jobs
    • +
    • Shuffle job order
    • Interactivity
    • A terminal for every job
    • Timing
    • @@ -93,6 +94,7 @@
    • Semaphore
    • Informational
    • @@ -1068,6 +1070,16 @@
        parallel --use-cpus-instead-of-cores -N0 sleep 1 :::: num8
      +

      Shuffle job order

      + +

      If you have many jobs (e.g. from multiple combinations of input sources), it can be handy to shuffle the jobs, so that the combinations are run in a different order each time.

      + +
        parallel --shuf echo ::: 1 2 3 ::: a b c ::: A B C
      + +

      Output:

      + +
        All combinations but different order for each run.
      +

      Interactivity

      GNU Parallel can ask the user if a command should be run using --interactive:

      @@ -1098,7 +1110,7 @@

      This will tell you to run something similar to:

      -
        tmux -S /tmp/paroRLCx.tms attach
      +
        tmux -S /tmp/tmsrPrO0 attach

      Using normal tmux keystrokes (CTRL-b n or CTRL-b p) you can cycle between windows of the running jobs. When a job is finished it will pause for 10 seconds before closing the window.

      @@ -1233,9 +1245,9 @@

      Termination

      -

      For certain jobs there is no need to continue if one of the jobs fails and has an exit code != 0. GNU Parallel will stop spawning new jobs with --halt 1:

      +

      For certain jobs there is no need to continue if one of the jobs fails and has an exit code != 0. GNU Parallel will stop spawning new jobs with --halt soon,fail=1:

      -
        parallel -j2 --halt 1 echo {}\; exit {} ::: 0 0 1 2 3
      +
        parallel -j2 --halt soon,fail=1 echo {}\; exit {} ::: 0 0 1 2 3

      Output:

      @@ -1248,9 +1260,9 @@ parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed: echo 2; exit 2 -

      With --halt 2 the running jobs will be killed immediately:

      +

      With --halt now,fail=1 the running jobs will be killed immediately:

      -
        parallel -j2 --halt 2 echo {}\; exit {} ::: 0 0 1 2 3
      +
        parallel -j2 --halt now,fail=1 echo {}\; exit {} ::: 0 0 1 2 3

      Output:

      @@ -1260,23 +1272,38 @@ parallel: This job failed: echo 1; exit 1 -

      If --halt is given a percentage this percentage of the jobs must fail (though minimum 3) before GNU Parallel stops spawning more jobs:

      +

      If --halt is given a percentage this percentage of the jobs must fail before GNU Parallel stops spawning more jobs:

      -
        parallel -j2 --halt 20% echo {}\; exit {} ::: 0 0 1 2 3 4 5 6 7
      +
        parallel -j2 --halt soon,fail=20% echo {}\; exit {} ::: 0 1 2 3 4 5 6 7 8 9

      Output:

        0
      -  0
         1
      +  parallel: This job failed:
      +  echo 1; exit 1
      +  2
      +  parallel: This job failed:
      +  echo 2; exit 2
      +  parallel: Starting no more jobs. Waiting for 1 jobs to finish.
      +  3
      +  parallel: This job failed:
      +  echo 3; exit 3
      + +

      If you are looking for success instead of failures, you can use success:

      + +
        parallel -j2 --halt soon,success=1 echo {}\; exit {} ::: 1 2 3 0 4 5 6
      + +

      Output:

      + +
        1
         2
         3
      -  4
      -  parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job failed:
      -      echo 4; exit 4
      -  5
      -  parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed:
      -      echo 5; exit 5
      + 0 + parallel: This job succeeded: + echo 0; exit 0 + parallel: Starting no more jobs. Waiting for 1 jobs to finish. + 4

      GNU Parallel can retry the command with --retries. This is useful if a command fails for unknown reasons now and then.

      @@ -1382,6 +1409,12 @@

      Output: Same as above.

      +

      Or newline:

      + +
        # This gives a \n between $SERVER1 and $SERVER2
      +  SERVERS="`echo $SERVER1; echo $SERVER2`"
      +  parallel -S "$SERVERS" echo ::: running on more hosts
      +

      The can also be read from a file (replace user@ with the user on $SERVER2):

        echo $SERVER1 > nodefile
      @@ -1401,6 +1434,24 @@
       
       
        force 4 cpus on server
      +

      Servers can be put into groups by prepending '@groupname' to the server and the group can then be selected by appending '@groupname' to the argument if using '--hostgrp'.

      + +
        parallel --hostgrp -S @grp1/$SERVER1 -S @grp2/$SERVER2 echo {} ::: run_on_grp1@grp1 run_on_grp2@grp2
      + +

      Output:

      + +
        run_on_grp1
      +  run_on_grp2
      + +

      A host can be in multiple groups by separating groups with '+', and you can force GNU parallel to limit the groups on which the command can be run with '-S @groupname':

      + +
        parallel -S @grp1 -S @grp1+grp2/$SERVER1 -S @grp2/$SERVER2 echo {} ::: run_on_grp1 also_grp1
      + +

      Output:

      + +
        run_on_grp1
      +  also_grp1
      +

      Transferring files

      GNU Parallel can transfer the files to be processed to the remote host. It does that using rsync.

      @@ -2039,6 +2090,25 @@ The third finished The fourth finished
      +

      Timeout

      + +

      With --semaphoretimeout you can force running the command anyway after a period (positive number) or give up (negative number):

      + +
        sem --id foo -u 'echo Slow started; sleep 5; echo Slow ended' &&
      +  sem --id foo --semaphoretimeout 1 'echo Force this running after 1 sec' &&
      +  sem --id foo --semaphoretimeout -2 'echo Give up after 2 sec'
      +  sem --id foo --wait
      + +

      Output:

      + +
        Slow started
      +  parallel: Warning: Semaphore timed out. Stealing the semaphore.
      +  Force this running after 1 sec
      +  Slow ended
      +  parallel: Warning: Semaphore timed out. Exiting.
      + +

      Note how the 'Give up' was not run.

      +

      Informational

      GNU Parallel has some options to give short information about the configuration.

      diff --git a/src/parallel_tutorial.pod b/src/parallel_tutorial.pod index 02708f20..211709e9 100644 --- a/src/parallel_tutorial.pod +++ b/src/parallel_tutorial.pod @@ -996,6 +996,18 @@ GNU Parallel can base it on the number of CPUs: parallel --use-cpus-instead-of-cores -N0 sleep 1 :::: num8 +=head2 Shuffle job order + +If you have many jobs (e.g. by multiple combinations of input +sources), it can be handy to shuffle the jobs, so you get different +values run. + + parallel --shuf echo ::: 1 2 3 ::: a b c ::: A B C + +Output: + + All combinations but different order for each run. + =head2 Interactivity GNU Parallel can ask the user if a command should be run using --interactive: @@ -1027,7 +1039,7 @@ Using tmux GNU Parallel can start a terminal for every job run: This will tell you to run something similar to: - tmux -S /tmp/paroRLCx.tms attach + tmux -S /tmp/tmsrPrO0 attach Using normal tmux keystrokes (CTRL-b n or CTRL-b p) you can cycle between windows of the running jobs. When a job is finished it will @@ -1178,9 +1190,9 @@ Note how seq 1 2 3 have been repeated because they had exit value != 0. For certain jobs there is no need to continue if one of the jobs fails and has an exit code != 0. GNU Parallel will stop spawning new jobs -with --halt 1: +with --halt soon,fail=1: - parallel -j2 --halt 1 echo {}\; exit {} ::: 0 0 1 2 3 + parallel -j2 --halt soon,fail=1 echo {}\; exit {} ::: 0 0 1 2 3 Output: @@ -1193,9 +1205,9 @@ Output: parallel: Starting no more jobs. Waiting for 1 jobs to finish. 
This job failed: echo 2; exit 2 -With --halt 2 the running jobs will be killed immediately: +With --halt now,fail=1 the running jobs will be killed immediately: - parallel -j2 --halt 2 echo {}\; exit {} ::: 0 0 1 2 3 + parallel -j2 --halt now,fail=1 echo {}\; exit {} ::: 0 0 1 2 3 Output: @@ -1206,23 +1218,38 @@ Output: echo 1; exit 1 If --halt is given a percentage this percentage of the jobs must fail -(though minimum 3) before GNU Parallel stops spawning more jobs: +before GNU Parallel stops spawning more jobs: - parallel -j2 --halt 20% echo {}\; exit {} ::: 0 0 1 2 3 4 5 6 7 + parallel -j2 --halt soon,fail=20% echo {}\; exit {} ::: 0 1 2 3 4 5 6 7 8 9 Output: + + 0 + 1 + parallel: This job failed: + echo 1; exit 1 + 2 + parallel: This job failed: + echo 2; exit 2 + parallel: Starting no more jobs. Waiting for 1 jobs to finish. + 3 + parallel: This job failed: + echo 3; exit 3 - 0 - 0 +If you are looking for success instead of failures, you can use success: + + parallel -j2 --halt soon,success=1 echo {}\; exit {} ::: 1 2 3 0 4 5 6 + +Output: + 1 2 3 + 0 + parallel: This job succeeded: + echo 0; exit 0 + parallel: Starting no more jobs. Waiting for 1 jobs to finish. 4 - parallel: Starting no more jobs. Waiting for 2 jobs to finish. This job failed: - echo 4; exit 4 - 5 - parallel: Starting no more jobs. Waiting for 1 jobs to finish. This job failed: - echo 5; exit 5 GNU Parallel can retry the command with --retries. This is useful if a command fails for unknown reasons now and then. @@ -1332,6 +1359,12 @@ Or they can be separated by ,: Output: Same as above. 
+Or newline: + + # This gives a \n between $SERVER1 and $SERVER2 + SERVERS="`echo $SERVER1; echo $SERVER2`" + parallel -S "$SERVERS" echo ::: running on more hosts + The can also be read from a file (replace user@ with the user on $SERVER2): echo $SERVER1 > nodefile @@ -1352,6 +1385,28 @@ Output: force 4 cpus on server +Servers can be put into groups by prepending '@groupname' to the +server and the group can then be selected by appending '@groupname' to +the argument if using '--hostgrp'. + + parallel --hostgrp -S @grp1/$SERVER1 -S @grp2/SERVER2 echo {} ::: run_on_grp1@grp1 run_on_grp2@grp2 + +Output: + + run_on_grp1 + run_on_grp2 + +A host can be in multiple groups by separating groups with '+', and +you can force GNU B to limit the groups on which the command +can be run with '-S @groupname': + + parallel -S @grp1 -S @grp1+grp2/$SERVER1 -S @grp2/SERVER2 echo {} ::: run_on_grp1 also_grp1 + +Output: + + run_on_grp1 + also_grp1 + =head2 Transferring files GNU Parallel can transfer the files to be processed to the remote @@ -2052,6 +2107,25 @@ Output: The third finished The fourth finished +=head2 Timeout + +With --semaphoretimeout you can force running the command anyway after +a period (postive number) or give up (negative number): + + sem --id foo -u 'echo Slow started; sleep 5; echo Slow ended' && + sem --id foo --semaphoretimeout 1 'echo Force this running after 1 sec' && + sem --id foo --semaphoretimeout -2 'echo Give up after 1 sec' + sem --id foo --wait + +Output: + + Slow started + parallel: Warning: Semaphore timed out. Stealing the semaphore. + Force this running after 1 sec + Slow ended + parallel: Warning: Semaphore timed out. Exiting. + +Note how the 'Give up' was not run. 
=head1 Informational diff --git a/testsuite/tests-to-run/parallel-tutorial.sh b/testsuite/tests-to-run/parallel-tutorial.sh index 00c3726e..58fe91f9 100644 --- a/testsuite/tests-to-run/parallel-tutorial.sh +++ b/testsuite/tests-to-run/parallel-tutorial.sh @@ -35,6 +35,8 @@ perl -ne '$/="\n\n"; /^Output/../^[^O]\S/ and next; /^ / and print;' ../../src/ s/\d{10}.\d{3}\s+..\d+/TIMESTAMP\t9.999/g; # Version s/201\d{5}/VERSION/g; + # [123] [abc] [ABC] + s/^[123] [abc] [ABC]$/123 abc ABC/g; # Remote script s/(PARALLEL_PID\D+)\d+/${1}000000/g; # /usr/bin/time -f %e