From d5e29c73a254dcd1aadbf7c05bf45dd31709a18d Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Tue, 17 Feb 2009 04:58:13 +0100 Subject: [PATCH] Preparations for option -x --- parallel | 180 ++++++++++++++++++++------------ parallel.1 | 87 +++++++++------ unittest/actual-results/test02 | 12 +-- unittest/actual-results/test05 | 2 +- unittest/actual-results/test08 | 2 + unittest/input-files/test08/a | 3 + unittest/input-files/test08/b | 3 + unittest/input-files/test08/c | 3 + unittest/input-files/test08/d | 4 + unittest/tests-to-run/test08.sh | 5 + unittest/wanted-results/test02 | 12 +-- unittest/wanted-results/test05 | 2 +- unittest/wanted-results/test08 | 2 + 13 files changed, 206 insertions(+), 111 deletions(-) create mode 100644 unittest/actual-results/test08 create mode 100644 unittest/input-files/test08/a create mode 100644 unittest/input-files/test08/b create mode 100644 unittest/input-files/test08/c create mode 100644 unittest/input-files/test08/d create mode 100644 unittest/tests-to-run/test08.sh create mode 100644 unittest/wanted-results/test08 diff --git a/parallel b/parallel index 845b6604..37063965 100755 --- a/parallel +++ b/parallel @@ -93,6 +93,10 @@ output from different commands to be mixed. Can be reversed with B<-g>. Verbose. Print the job to be run. Can be reversed with B<-s>. +=item B<-x> + +xargs. Insert as many arguments as the command line length permits. + =back =head1 EXAMPLE 1: Working as cat | sh. Ressource inexpensive jobs and evaluation @@ -185,6 +189,10 @@ files in each directory: B +To put the output in a file called .dir: + +B>B< {}.dir'> + =head1 QUOTING For more advanced use quoting may be an issue. The following will @@ -208,6 +216,12 @@ B>B<{}.diff"> because > needs to be interpreted by the shell. +If you get errors like: + +B + +then you might try using B<-q>. + B: To avoid dealing with the quoting problems it may be easier just to write a small script and have B call that script. @@ -240,19 +254,24 @@ cat | tr '\n' '\0' | xargs -0 -n1 I B can run a given number of jobs in parallel, but has no support for running no_of_cpus jobs in parallel. -B has no support for grouping the output, so output may run -together, so the first half of a line is from one process and the last -half of the line is from another process. +B has no support for grouping the output, therefore output may +run together, e.g. the first half of a line is from one process and +the last half of the line is from another process. If no command is given to B it defaults to /bin/echo. So the B functionality is missing. +Quoting in B works like B<-q> in B. Doing B> B<{}.wc"> using B seems to be impossible. =head1 BUGS Filenames beginning with '-' can cause some commands to give unexpected results, as it will often be interpreted as an option. +If you have a lot of filehandles, then computing the max no +takes a long time. + =head1 REPORTING BUGS @@ -261,7 +280,8 @@ Report bugs to . =head1 AUTHOR -Copyright (C) 2007 Ole Tange, http://ole.tange.dk +Copyright (C) 2007-10-18 Ole Tange, http://ole.tange.dk +Copyright (C) 2008-2009 Ole Tange, http://ole.tange.dk =head1 LICENSE @@ -294,6 +314,7 @@ B(1), B(1) =cut + use IPC::Open3; use Symbol qw(gensym); use IO::File; @@ -316,6 +337,7 @@ $/="\n"; $Global::debug = (defined $::opt_d); if(defined $::opt_j) { $processes = compute_number_of_processes($::opt_j); } +if(defined $::opt_x) { $Global::xargs = 1; } if(defined $::opt_v) { $Global::verbose = 1; } if(defined $::opt_s) { $Global::verbose = 0; } if(defined $::opt_g) { $Global::grouped = 1; } @@ -326,17 +348,20 @@ if(defined $::opt_0) { $/ = "\0"; } if(defined $::opt_q) { $Global::quoting = 1; } if(@ARGV) { if($Global::quoting) { - $command = join(" ", shell_quote(@ARGV)); + $Global::command = join(" ", shell_quote(@ARGV)); } else { - $command = join(" ", @ARGV); + $Global::command = join(" ", @ARGV); } } init_run_jobs(); -while($args = ) { - my $cmd_line = generate_command_line($command, $args); - queue_job($processes, $cmd_line); +DoNotReap(); +while($Global::running_jobs < $processes + and + start_another_job()) { + # skip } +ReapIfNeeded(); drain_job_queue(); # @@ -345,12 +370,15 @@ drain_job_queue(); sub generate_command_line { my $command = shift; - my $line = shift; - chomp($line); - my ($job_line,$arg); + my @args = @_; + chomp(@args); if($Global::input_is_filename) { - ($line) = (shell_quote($line)); + for my $arg (@args) { + ($arg) = (shell_quote($arg)); + } } + my $line = join(" ",@args); + my ($job_line,$arg); if($command) { $job_line = $command; $arg = $line; @@ -385,9 +413,13 @@ sub shell_quote { } # -# Number of processes +# Number of processes, filehandles, max length of command line # +sub max_length_of_command_line { + +} + sub compute_number_of_processes { my $opt_j = shift; my $processes = 0; @@ -416,13 +448,16 @@ sub compute_number_of_processes { $processes = 1; } } - my $free_handles = compute_no_of_free_filehandles(); - if($processes > $free_handles / 2) { - # Every simultaneous process uses 2 filehandles when grouping - # perl uses 7 for something? - $processes = int (($free_handles -7) / 2); + # Every simultaneous process uses 2 filehandles when grouping + # perl uses 7 for something? + # parallel uses 1 for memory_usage + my $file_handles_needed = $processes*2+7+1; + my $free_handles = compute_no_of_free_filehandles($file_handles_needed); + if($file_handles_needed > $free_handles) { + $processes = int (($free_handles -7 -1) / 2); print STDERR ("Warning: Only enough filehandles to run ", - $processes, " jobs in parallel\n"); + $processes, " jobs in parallel. ", + "Raising ulimit -n may help\n"); } debug("Computed processes: ".(int $processes)."\n"); @@ -430,11 +465,12 @@ sub compute_number_of_processes { } sub compute_no_of_free_filehandles { + my $needed = shift; my $i=1; my %fh; - while(open($fh{$i}," $needed and last } for (keys %fh) { close $fh{$_} } - debug("Number of free handles: ".."\n"); + debug("Number of free handles: ".$i."\n"); return $i; } @@ -491,42 +527,21 @@ sub init_run_jobs { $Global::running_jobs=0; } -sub queue_job { - my $processes = shift; - my $command = shift; - - if($command =~ /\S/) { - # Only run commands - not empty lines - push @Global::command, $command; - } - if($Global::running_jobs < $processes) { - debug("queing $command\n"); - start_another_job(); - debug("queued $command\n"); - } +sub next_command_line { + my $cmd_line; + do { + if(eof STDIN) { + return undef; + } + my $args = ; + $cmd_line = generate_command_line($Global::command, $args); + } while ($cmd_line =~ /^\s*$/); # Skip empty lines + return $cmd_line; } sub drain_job_queue { - while(1) { - if($Global::running_jobs == 0) { last } - debug("jobs running: $Global::running_jobs\n"); - sleep 1; - } -} - -sub run_jobs { - # Run the jobs in @Global::command with $processes simultaneously - my $processes = shift; - - for my $i (1..$processes) { - # Start $processes jobs. - # When a job dies, the handler will take care of starting another - start_another_job(); - } - - while(1) { - if($Global::running_jobs == 0) { last } - debug("jobs running: $Global::running_jobs\n"); + while($Global::running_jobs > 0) { + debug("jobs running: $Global::running_jobs Memory usage:".my_memory_usage()."\n"); sleep 1; } } @@ -535,12 +550,13 @@ sub start_another_job { # Grab a job from @Global::command, start it # and remember the pid, the STDOUT and the STDERR handles # If no more jobs: do nothing - my $command = shift @Global::command; + my $command = next_command_line(); if(defined $command) { - DoNotReap(); my %jobinfo = start_job($command); $Global::running{$jobinfo{"pid"}} = \%jobinfo; - ReapIfNeeded(); + return 1; + } else { + return 0; } } @@ -567,9 +583,9 @@ sub start_job { print STDOUT $command,"\n"; } $Global::running_jobs++; - debug("starting: $command\n"); + debug("$Global::running_jobs processes. Starting: $command\n"); $pid = open3(gensym, ">&STDOUT", ">&STDERR", $command) || - die("open3 failed. Report a bug\n"); + die("open3 failed. Report a bug to \n"); debug("started: $command\n"); open STDOUT, ">&", $Global::original_stdout or die "Can't dup \$oldout: $!"; open STDERR, ">&", $Global::original_stderr or die "Can't dup \$oldout: $!"; @@ -593,7 +609,7 @@ sub print_job { my $err = $fhs->{err}; my $command = $fhs->{command}; - debug(">>job\n"); + debug(">>joboutput $command\n"); if($Global::verbose and $Global::grouped) { print STDOUT $command,"\n"; # If STDOUT and STDERR is merged, we want the command to be printed first @@ -605,10 +621,10 @@ sub print_job { while( <$err> ) { print STDERR "ERR: $_" } while( <$out> ) { print STDOUT "OUT: $_" } } else { - while( <$err> ) { print STDERR } - while( <$out> ) { print STDOUT } + print STDERR <$err>; + print STDOUT <$out>; } - debug("< 0) { print_job($Global::running{$stiff}); delete $Global::running{$stiff}; @@ -651,6 +668,8 @@ sub Reaper { start_another_job(); } ReapIfNeeded(); + debug("Reaper exit $Global::reaperlevel\n"); + $Global::reaperlevel--; } # @@ -680,6 +699,33 @@ sub debug { } } +sub my_memory_usage { + use strict; + use FileHandle; + + my $pid = $$; + my $fh = FileHandle->new("; + chomp $data; + $fh->close; + + my @procinfo = split(/\s+/,$data); + + return $procinfo[22]; +} + +sub my_size { + my @size_this = (@_); + eval "use Devel::Size qw(size total_size)"; + if ($@) { + return -1; + } else { + return total_size(@_); + } +} + + sub my_dump { my @dump_this = (@_); eval "use Data::Dump qw(dump);"; @@ -703,7 +749,7 @@ sub my_dump { # Keep perl -w happy $main::opt_u = $main::opt_c = $main::opt_f = $main::opt_q = $main::opt_0 = $main::opt_s = $main::opt_v = $main::opt_g = -$main::opt_j = $main::opt_d=1; - +$main::opt_j = $main::opt_d = $main::opt_x =1; +$Global::xargs = 1; diff --git a/parallel.1 b/parallel.1 index 001b6d02..53ebda1f 100644 --- a/parallel.1 +++ b/parallel.1 @@ -1,15 +1,7 @@ -.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 +.\" Automatically generated by Pod::Man 2.18 (Pod::Simple 3.05) .\" .\" Standard preamble: .\" ======================================================================== -.de Sh \" Subsection heading -.br -.if t .Sp -.ne 5 -.PP -\fB\\$1\fR -.PP -.. .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp @@ -48,22 +40,25 @@ . ds R" '' 'br\} .\" +.\" Escape single quotes in literal strings from groff's Unicode transform. +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" .\" If the F register is turned on, we'll generate index entries on stderr for -.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index +.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. -.if \nF \{\ +.ie \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} -.\" -.\" For nroff, turn off justification. Always turn off hyphenation; it makes -.\" way too many mistakes in technical documents. -.hy 0 -.if n .na +.el \{\ +. de IX +.. +.\} .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. @@ -129,7 +124,11 @@ .\" ======================================================================== .\" .IX Title "PARALLEL 1" -.TH PARALLEL 1 "2007-09-14" "perl v5.8.8" "User Contributed Perl Documentation" +.TH PARALLEL 1 "2009-02-17" "perl v5.10.0" "User Contributed Perl Documentation" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. +.if n .ad l +.nh .SH "NAME" parallel \- build and execute command lines from standard input in parallel .SH "SYNOPSIS" @@ -204,9 +203,12 @@ output from different commands to be mixed. Can be reversed with \fB\-g\fR. .IP "\fB\-v\fR" 9 .IX Item "-v" Verbose. Print the job to be run. Can be reversed with \fB\-s\fR. +.IP "\fB\-x\fR" 9 +.IX Item "-x" +xargs. Insert as many arguments as the command line length permits. .SH "EXAMPLE 1: Working as cat | sh. Ressource inexpensive jobs and evaluation" .IX Header "EXAMPLE 1: Working as cat | sh. Ressource inexpensive jobs and evaluation" -\&\fBparallel\fR can work similar to \fBcat | sh\fR. +\&\fBparallel\fR can work similar to \fBcat | sh\fR. .PP A ressource inexpensive job is a job that takes very little \s-1CPU\s0, disk I/O and network I/O. Ping is an example of a ressource inexpensive @@ -232,7 +234,7 @@ As there is not a \fBcommand\fR the option \fB\-c\fR is default because the jobs needs to be evaluated by the shell. .SH "EXAMPLE 2: Working as xargs \-n1. Argument appending" .IX Header "EXAMPLE 2: Working as xargs -n1. Argument appending" -\&\fBparallel\fR can work similar to \fBxargs \-n1\fR. +\&\fBparallel\fR can work similar to \fBxargs \-n1\fR. .PP To output all html files run: .PP @@ -264,8 +266,8 @@ thumb_./foo/bar.jpg\fR would clearly be wrong). It will result in files like ./foo/bar.jpg_thumb.jpg. If that is not wanted this can fix it: .PP .Vb 3 -\& find . \-name '*.jpg' | \e -\& perl \-pe 'chomp; $a=$_; s:/([^/]+)$:/thumb_$1:; $_="convert \-geometry 120 $a $_\en"' | \e +\& find . \-name \*(Aq*.jpg\*(Aq | \e +\& perl \-pe \*(Aqchomp; $a=$_; s:/([^/]+)$:/thumb_$1:; $_="convert \-geometry 120 $a $_\en"\*(Aq | \e \& parallel \-c \-j +0 .Ve .PP @@ -274,8 +276,8 @@ characters (such as space or quotes). If you have \fBren\fR installed this is a better solution: .PP .Vb 2 -\& find . \-name '*.jpg' | parallel \-j +0 convert \-geometry 120 {} {}_thumb.jpg -\& find . \-name '*_thumb.jpg' | ren 's:/([^/]+)_thumb.jpg$:/thumb_$1:' +\& find . \-name \*(Aq*.jpg\*(Aq | parallel \-j +0 convert \-geometry 120 {} {}_thumb.jpg +\& find . \-name \*(Aq*_thumb.jpg\*(Aq | ren \*(Aqs:/([^/]+)_thumb.jpg$:/thumb_$1:\*(Aq .Ve .SH "EXAMPLE 4: Substitution and redirection" .IX Header "EXAMPLE 4: Substitution and redirection" @@ -294,6 +296,10 @@ A job can consist of several commands. This will print the number of files in each directory: .PP \&\fBls | parallel 'echo \-n {}\*(L" \*(R"; ls {}|wc \-l'\fR +.PP +To put the output in a file called .dir: +.PP +\&\fBls | parallel '(echo \-n {}\*(L" \*(R"; ls {}|wc \-l) \fR>\fB {}.dir'\fR .SH "QUOTING" .IX Header "QUOTING" For more advanced use quoting may be an issue. The following will @@ -313,17 +319,23 @@ the quoting by using option \fB\-q\fR: However, this means you cannot make the shell interpret special characters. For example this \fBwill not work\fR: .PP -\&\fBls | parallel \-q "diff {} foo \fR>\fB{}.diff"\fR +\&\fBls | parallel \-q "diff {} foo \fR>\fB{}.diff"\fR .PP because > needs to be interpreted by the shell. .PP +If you get errors like: +.PP +\&\fBsh: \-c: line 0: syntax error near unexpected token\fR +.PP +then you might try using \fB\-q\fR. +.PP \&\fBConclusion\fR: To avoid dealing with the quoting problems it may be easier just to write a small script and have \fBparallel\fR call that script. .SH "DIFFERENCES BETWEEN xargs/find \-exec AND parallel" .IX Header "DIFFERENCES BETWEEN xargs/find -exec AND parallel" \&\fBxargs\fR and \fBfind \-exec\fR offer some of the same possibilites as -\&\fBparallel\fR. +\&\fBparallel\fR. .PP \&\fBfind \-exec\fR only works on files. So processing other input (such as hosts or URLs) will require creating these inputs as files. \fBfind @@ -336,7 +348,7 @@ hosts or URLs) will require creating these inputs as files. \fBfind (\-0 and \e0 instead of \en), \fBlocate\fR (requires using \-0), \fBfind\fR (requires using \-print0), \fBgrep\fR (requires user to use \-z or \-Z). .PP -The input \fIcan\fR be fixed for \fBxargs\fR with: +The input \fIcan\fR be fixed for \fBxargs\fR with: .PP tr '\en' '\e0' .PP @@ -347,22 +359,37 @@ cat | tr '\en' '\e0' | xargs \-0 \-n1 \fIcommand\fR \&\fBxargs\fR can run a given number of jobs in parallel, but has no support for running no_of_cpus jobs in parallel. .PP -\&\fBxargs\fR has no support for grouping the output, so output may run -together, so the first half of a line is from one process and the last -half of the line is from another process. +\&\fBxargs\fR has no support for grouping the output, therefore output may +run together, e.g. the first half of a line is from one process and +the last half of the line is from another process. .PP If no command is given to \fBxargs\fR it defaults to /bin/echo. So the \&\fBcat | sh\fR functionality is missing. +.PP +Quoting in \fBxargs\fR works like \fB\-q\fR in \fBparallel\fR. Doing \fBls | +parallel "wc {} \fR> \fB{}.wc"\fR using \fBxargs\fR seems to be impossible. .SH "BUGS" .IX Header "BUGS" Filenames beginning with '\-' can cause some commands to give unexpected results, as it will often be interpreted as an option. +.PP +This takes up all memory: +.PP +.Vb 1 +\& seq 1 1000000000 | parallel very_loong_command +.Ve +.PP +Should be fixed by only generating a new command when needed +(i.e. when a command has finished). +.PP +If you have a lot of filehandles, then computing the max no +takes a long time. .SH "REPORTING BUGS" .IX Header "REPORTING BUGS" Report bugs to . .SH "AUTHOR" .IX Header "AUTHOR" -Copyright (C) 2007 Ole Tange, http://ole.tange.dk +Copyright (C) 2007\-10\-18 Ole Tange, http://ole.tange.dk .SH "LICENSE" .IX Header "LICENSE" Copyright (C) 2007 Free Software Foundation, Inc. diff --git a/unittest/actual-results/test02 b/unittest/actual-results/test02 index 29391ded..3a8e2719 100644 --- a/unittest/actual-results/test02 +++ b/unittest/actual-results/test02 @@ -1,13 +1,13 @@ . -./中国 (Zhōngguó) -./中国 (Zhōngguó)/thumb_China's (中国) road.jpg -./中国 (Zhōngguó)/China's (中国) road.jpg ./a ./a/foo2 -./a/foo ./a/bar -./2-col.txt +./a/foo ./b -./b/foo ./b/bar +./b/foo +./2-col.txt +./中国 (Zhōngguó) +./中国 (Zhōngguó)/China's (中国) road.jpg +./中国 (Zhōngguó)/thumb_China's (中国) road.jpg ./1-col.txt diff --git a/unittest/actual-results/test05 b/unittest/actual-results/test05 index 7b0b880c..62ecb2bf 100644 --- a/unittest/actual-results/test05 +++ b/unittest/actual-results/test05 @@ -4,5 +4,5 @@ Removing files 4e2ffc66811f839854f2f0071c1e0541 - There are 6246 dirs with files Removing dirs -14e628fb222c872cf383280269b2397f - +1c91bf0327094531133e6ad95d2e23f5 - There are 1 dirs with files diff --git a/unittest/actual-results/test08 b/unittest/actual-results/test08 new file mode 100644 index 00000000..c3219ebb --- /dev/null +++ b/unittest/actual-results/test08 @@ -0,0 +1,2 @@ +b +d diff --git a/unittest/input-files/test08/a b/unittest/input-files/test08/a new file mode 100644 index 00000000..75c86238 --- /dev/null +++ b/unittest/input-files/test08/a @@ -0,0 +1,3 @@ +flyp +_PRE 8 +hatchname> 8 diff --git a/unittest/input-files/test08/b b/unittest/input-files/test08/b new file mode 100644 index 00000000..dde9fdd6 --- /dev/null +++ b/unittest/input-files/test08/b @@ -0,0 +1,3 @@ +flyp +_PRE 9 +hatchname> 8 diff --git a/unittest/input-files/test08/c b/unittest/input-files/test08/c new file mode 100644 index 00000000..3712521e --- /dev/null +++ b/unittest/input-files/test08/c @@ -0,0 +1,3 @@ +flyp +_PRE 19 +hatchname> 19 diff --git a/unittest/input-files/test08/d b/unittest/input-files/test08/d new file mode 100644 index 00000000..2f3c6a95 --- /dev/null +++ b/unittest/input-files/test08/d @@ -0,0 +1,4 @@ +flyp +_PRE 19 +hatchname> 9 + diff --git a/unittest/tests-to-run/test08.sh b/unittest/tests-to-run/test08.sh new file mode 100644 index 00000000..6cd5b696 --- /dev/null +++ b/unittest/tests-to-run/test08.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cd input-files/test08 + +ls | parallel -q perl -ne '/_PRE (\d+)/ and $p=$1; /hatchname> (\d+)/ and $1!=$p and print $ARGV,"\n"' diff --git a/unittest/wanted-results/test02 b/unittest/wanted-results/test02 index 29391ded..3a8e2719 100644 --- a/unittest/wanted-results/test02 +++ b/unittest/wanted-results/test02 @@ -1,13 +1,13 @@ . -./中国 (Zhōngguó) -./中国 (Zhōngguó)/thumb_China's (中国) road.jpg -./中国 (Zhōngguó)/China's (中国) road.jpg ./a ./a/foo2 -./a/foo ./a/bar -./2-col.txt +./a/foo ./b -./b/foo ./b/bar +./b/foo +./2-col.txt +./中国 (Zhōngguó) +./中国 (Zhōngguó)/China's (中国) road.jpg +./中国 (Zhōngguó)/thumb_China's (中国) road.jpg ./1-col.txt diff --git a/unittest/wanted-results/test05 b/unittest/wanted-results/test05 index 7b0b880c..62ecb2bf 100644 --- a/unittest/wanted-results/test05 +++ b/unittest/wanted-results/test05 @@ -4,5 +4,5 @@ Removing files 4e2ffc66811f839854f2f0071c1e0541 - There are 6246 dirs with files Removing dirs -14e628fb222c872cf383280269b2397f - +1c91bf0327094531133e6ad95d2e23f5 - There are 1 dirs with files diff --git a/unittest/wanted-results/test08 b/unittest/wanted-results/test08 new file mode 100644 index 00000000..c3219ebb --- /dev/null +++ b/unittest/wanted-results/test08 @@ -0,0 +1,2 @@ +b +d