From 822f0d4d736d06dd70f404ce12ea6df1d8cd092f Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Sun, 30 Jun 2013 18:11:36 +0200 Subject: [PATCH] src/parallel: --filter-host is now much faster. --- doc/release_new_version | 26 ++++++------------------- src/parallel | 43 +++++++++++++++++++++++++++-------------- src/parallel.pod | 31 ++++++++++++++++++++++++++--- src/parallel.texi | 29 +++++++++++++++++++++++++-- 4 files changed, 90 insertions(+), 39 deletions(-) diff --git a/doc/release_new_version b/doc/release_new_version index 30fec280..9f12d160 100644 --- a/doc/release_new_version +++ b/doc/release_new_version @@ -163,7 +163,7 @@ http://freshmeat.net/projects/parallel/releases/new == Update Diaspora Twitter == New release of #GNU Parallel pi.dk/0 New in this release pi.dk/2 See the intro videos pi.dk/1 -10 seconds installation: wget -O - pi.dk/3|sh +10 secs installation: wget -O - pi.dk/3|sh [x] Twitter Aspect: Public @@ -171,7 +171,7 @@ Aspect: Public == Send announce == http://groups.google.com/group/comp.unix.shell/post -Newsgroups: comp.unix.shell,comp.unix.admin +http://groups.google.com/group/comp.unix.admin/post https://lists.gnu.org/mailman/admindb/bug-parallel https://lists.gnu.org/mailman/admindb/parallel @@ -195,32 +195,18 @@ cc:Sandro Cazzaniga , Ryoichiro Suzuki , Jesse Alama -Subject: GNU Parallel 20130622 ('Snowden') released +Subject: GNU Parallel 20130722 ('') released -GNU Parallel 20130622 ('Snowden') has been released. It is +GNU Parallel 20130722 ('') has been released. It is available for download at: http://ftp.gnu.org/gnu/parallel/ Very few changes so this can be considered a stable release. New in this release: -* --xapply now recycles arguments if an input source has more - arguments than others. +* http://www.brunokim.com.br/blog/?p=18 -* The sleep time between jobs is now both increased and decreased - exponentially. - -* 10 seconds installation check the signature using GnuPG if GnuPG is - installed. 
- -* Developer job asking for GNU Parallel expertise. - http://careers.stackoverflow.com/jobs/35562/developer-big-data-geo-and-web-climate-central - -* A small utility program to run youtube-dl in parallel. - https://github.com/dlh/youtube-dl-parallel - -* Parallelizing Freesurfer: - http://blog.cogneurostats.com/?p=148 +* http://www.open-open.com/news/view/371301 * Bug fixes and man page updates. diff --git a/src/parallel b/src/parallel index 0685cc50..bf8d86b4 100755 --- a/src/parallel +++ b/src/parallel @@ -99,21 +99,36 @@ if($opt::header and not $opt::pipe) { # Parallel check for all hosts are up #if(not $opt::plain and (@opt::sshlogin or @opt::sshloginfile)) { if($opt::filter_hosts and (@opt::sshlogin or @opt::sshloginfile)) { - my @S = map { "-S " . ::shell_quote_scalar($_) } @opt::sshlogin; - my @slf = map { "--slf " . ::shell_quote_scalar($_) } @opt::sshloginfile; - my $cmd = "$0 --plain --tag --joblog - -k --onall @S @slf " . - "::: ". - "'parallel --number-of-cores ' ". - "'parallel --number-of-cpus' ". - "'parallel --max-line-length-allowed' ". - "'true' "; + my(@cores, @cpus, @maxline, @echo); + while (my ($host, $sshlogin) = each %Global::host) { + # The 'true' is used to get the $host out later + my $sshcmd = "true $host;" . $sshlogin->sshcommand()." ".$sshlogin->serverlogin(); + push(@cores, $host."\t".$sshcmd." parallel --number-of-cores\n"); + push(@cpus, $host."\t".$sshcmd." parallel --number-of-cpus\n"); + push(@maxline, $host."\t".$sshcmd." parallel --max-line-length-allowed\n"); + # 'echo' is used to get the best possible value for an ssh login time + push(@echo, $host."\t".$sshcmd." 
echo\n"); + } + my ($fh, $tmpfile) = ::tempfile(SUFFIX => ".ssh"); + print $fh @cores, @cpus, @maxline, @echo; + close $fh; + my $cmd = "cat $tmpfile | $0 -j0 -s 1000 --joblog - --plain --tag --tagstring {1} --colsep '\t' -k eval {2}"; ::debug($cmd."\n"); open(my $host_fh, "-|", $cmd) || ::die_bug("parallel host check: $cmd"); - my (%ncores, %ncpus, %time_to_login, %maxlen); + my (%ncores, %ncpus, %time_to_login, %maxlen, %echo); while(<$host_fh>) { my @col = split /\t/, $_; if(defined $col[6]) { # This is a line from --joblog + # 2 : 1372607672.654 0.675 0 0 0 0 eval true\ m\;ssh\ m\ parallel\ --number-of-cores + if($col[0] eq "Seq" and $col[1] eq "Host" and + $col[2] eq "Starttime" and $col[3] eq "Runtime") { + # Header => skip + next; + } + # Get server from: eval true server\; + $col[8] =~ /eval true..([^;]+).;/ or ::die_bug("col8 does not contain host: $col[8]"); + my $host = $1; if($col[6] eq "255") { # signal == 255: ssh failed # Remove sshlogin @@ -126,14 +141,11 @@ if($opt::filter_hosts and (@opt::sshlogin or @opt::sshloginfile)) { $ncores{$col[1]} = 1; $ncpus{$col[1]} = 1; $maxlen{$col[1]} = Limits::Command::max_length(); - } elsif($col[0] =~ /^\d+$/ and $Global::host{$col[1]}) { + } elsif($col[0] =~ /^\d+$/ and $Global::host{$host}) { # 1 server 1338156112.05 0.303 0 0 0 0 # parallel --number-of-cores ; parallel --number-of-cpus # Remember how log it took to log in - $time_to_login{$col[1]} = ::min($time_to_login{$col[1]},$col[3]); - } elsif($col[0] eq "Seq" and $col[1] eq "Host" and - $col[2] eq "Starttime" and $col[3] eq "Runtime") { - # skip + $time_to_login{$host} = ::min($time_to_login{$host},$col[3]); } else { ::die_bug("host check unmatched long jobline: $_"); } @@ -149,6 +161,8 @@ if($opt::filter_hosts and (@opt::sshlogin or @opt::sshloginfile)) { $ncpus{$col[0]} = $col[1]; } elsif(not $maxlen{$col[0]}) { $maxlen{$col[0]} = $col[1]; + } elsif(not $echo{$col[0]}) { + $echo{$col[0]} = $col[1]; } else { ::die_bug("host check too many col0: $_"); } 
@@ -157,6 +171,7 @@ if($opt::filter_hosts and (@opt::sshlogin or @opt::sshloginfile)) { } } close $host_fh; + unlink $tmpfile; while (my ($sshlogin, $obj) = each %Global::host) { $ncpus{$sshlogin} or ::die_bug("ncpus missing: ".$obj->serverlogin()); $ncores{$sshlogin} or ::die_bug("ncores missing: ".$obj->serverlogin()); diff --git a/src/parallel.pod b/src/parallel.pod index fb2fbda0..19499485 100644 --- a/src/parallel.pod +++ b/src/parallel.pod @@ -68,8 +68,8 @@ B. If I is not given GNU B will behave similar to B. The I must be an executable, a script, a composed command, or -a function. If it is a function you need to B<export -f> the function -first. An alias will, however, not work (see why +a function. If it is a Bash function you need to B<export -f> the +function first. An alias will, however, not work (see why http://www.perlmonks.org/index.pl?node_id=484296). @@ -1745,6 +1745,31 @@ Find the files in a list that do not exist B<cat file_list | parallel 'if [ ! -e {} ] ; then echo {}; fi'> + +=head1 EXAMPLE: Calling Bash functions + +If the composed command is longer than a line, it becomes hard to +read. In Bash you can use functions. Just remember to B<export -f> the +function. + + doit() { + echo Doing it for $1 + sleep 2 + echo Done with $1 + } + export -f doit + parallel doit ::: 1 2 3 + + doubleit() { + echo Doing it for $1 $2 + sleep 2 + echo Done with $1 $2 + } + export -f doubleit + parallel doubleit ::: 1 2 3 ::: a b + + + =head1 EXAMPLE: Removing file extension when processing files When processing files removing the file extension using B<{.}> is @@ -1782,7 +1807,7 @@ B 100% will make sense. @item @strong{--tollef} (obsolete - will be retired 20140222) @@ -1866,6 +1866,31 @@ Find the files in a list that do not exist @strong{cat file_list | parallel 'if [ ! -e @{@} ] ; then echo @{@}; fi'} +@chapter EXAMPLE: Calling Bash functions +@anchor{EXAMPLE: Calling Bash functions} + +If the composed command is longer than a line, it becomes hard to +read. In Bash you can use functions. Just remember to @strong{export -f} the +function. 
+ +@verbatim + doit() { + echo Doing it for $1 + sleep 2 + echo Done with $1 + } + export -f doit + parallel doit ::: 1 2 3 + + doubleit() { + echo Doing it for $1 $2 + sleep 2 + echo Done with $1 $2 + } + export -f doubleit + parallel doubleit ::: 1 2 3 ::: a b +@end verbatim + @chapter EXAMPLE: Removing file extension when processing files @anchor{EXAMPLE: Removing file extension when processing files} @@ -1904,7 +1929,7 @@ foo) you can do: Let us assume a website stores images like: @verbatim - http://www.example.com/path/to/YYYYMMDD_##.jpg + http://www.example.com/path/to/YYYYMMDD_##.jpg @end verbatim where YYYYMMDD is the date and ## is the number 01-10. This will