parallel: Implemented --pipepart

2024-11-26 07:57:58 +00:00 · 2014-04-09 05:11:51 +02:00 · 2014-04-09 05:11:51 +02:00 · 34e131b894
parent f2c1f65a5a
commit 34e131b894
2 changed files with 231 additions and 36 deletions
--- a/doc/release_new_version
+++ b/doc/release_new_version
@ -158,6 +158,11 @@ git commit -a
 Released as 20140X22 ('').
 TAG=MyTag
 YYYYMMDD=`yyyymmdd`
 git tag -s -m "Released as $YYYYMMDD ('$TAG')" $TAG
 git tag -s -m "Released as $YYYYMMDD ('$TAG')" $YYYYMMDD
 == Update Savannah ==
--- a/src/parallel
+++ b/src/parallel
@ -68,9 +68,13 @@ if(@ARGV) {
 }
 my @fhlist;
-@fhlist = map { open_or_exit($_) } @opt::a;
+if($opt::pipepart) {
-if(not @fhlist and not $opt::pipe) {
+    @fhlist = map { open_or_exit($_) } "/dev/null";
 } else {
    @fhlist = map { open_or_exit($_) } @opt::a;
    if(not @fhlist and not $opt::pipe) {
 	@fhlist = (*STDIN);
    }
 }
 if($opt::skip_first_line) {
    # Skip the first line for the first file handle
@ -121,7 +125,7 @@ if($opt::nonall or $opt::onall) {
 $Global::JobQueue = JobQueue->new(
    $command,\@fhlist,$Global::ContextReplace,$number_of_args,\@Global::ret_files);
-if($opt::pipe and @opt::a) {
+if(0 and $opt::pipe and @opt::a) {
    # ... | parallel --pipe cmd ::: arg1 arg2
    # The command to run is:
    # tee >((cmd arg1) >/tmp/tmp1 2>/tmp/err1) >((cmd arg2) >/tmp/tmp2 2>/tmp/err2) >/dev/null
@ -153,6 +157,13 @@ if($opt::eta or $opt::bar) {
    # Count the number of jobs before starting any
    $Global::JobQueue->total_jobs();
 }
 if($opt::pipepart) {
    my @cmdlines;
    for(@opt::a) {
 	push(@cmdlines, pipe_part_files($_));
    }
    $Global::JobQueue->{'commandlinequeue'}->unget(@cmdlines);
 }
 for my $sshlogin (values %Global::host) {
    $sshlogin->max_jobs_running();
 }
@ -163,9 +174,12 @@ if($Global::semaphore) {
    $sem = acquire_semaphore();
 }
 $SIG{TERM} = \&start_no_new_jobs;
 start_more_jobs();
-if($opt::pipe) {
+if(not $opt::pipepart) {
    if($opt::pipe) {
 	spreadstdin();
    }
 }
 ::debug("Start draining\n");
 drain_job_queue();
@ -197,6 +211,170 @@ if($opt::halt_on_error) {
 sub __PIPE_MODE__ {}
 # parallel --part-pipe -a bigfile cat
 # =>
 # (dd 1) | cat
 # (dd 2) | cat
 # (dd 3) | cat
 sub pipe_part_files {
    my ($file) = @_;
    # find positions
    my @pos = find_split_positions($file,$opt::blocksize);
    # unshift job with dd_prefix
    my @cmdlines;
    for(my $i=0; $i<$#pos; $i++) {
 	my $cmd = $Global::JobQueue->{'commandlinequeue'}->get();
 	# TODO prepend --header (how?) 
 	$cmd->{'replaced'} = cat_partial($file, $pos[$i],$pos[$i+1])."|" .
 	    $cmd->{'replaced'};
 	::debug("Unget ".$cmd->{'replaced'}."\n");
 	push(@cmdlines, $cmd);
    }
    return @cmdlines;
 }
 sub find_split_positions {
    # Input:
    #   $file = the file to read
    #   $block = (minimal) --block-size of each chunk
    # Uses:
    #   $opt::recstart
    #   $opt::recend
    # Returns:
    #   @positions of block start/end
    my($file, $block) = @_;
    my $size = -s $file;
    # The optimal dd blocksize for mint, redhat, solaris, openbsd = 2^17..2^20
    # The optimal dd blocksize for freebsd = 2^15..2^17
    my $dd_block_size = 131072; # 2^17
    my @pos;
    my ($recstart,$recend) = recstartrecend();
    my $recendrecstart = $recend.$recstart;
    open (my $fh, "<", $file) || die;
    push(@pos,0);
    for(my $pos = $block; $pos < $size; $pos += $block) {
 	my $buf;
 	seek($fh, $pos, 0) || die;
 	while(read($fh,substr($buf,length $buf,0),$dd_block_size)) {
 	    # If match $recend$recstart => Record position
 	    my $i = index($buf,$recendrecstart);
 	    if($i != -1) {
 		push(@pos,$pos+$i);
 		# Start looking for next record _after_ this match
 		$pos += $i;
 		last;
 	    }
 	}
    }
    push(@pos,$size);
    close $fh;
    return @pos;
 }
 sub cat_partial {
    # Input:
    #   $file = the file to read
    #   $start = start byte
    #   $end = end byte
    # Returns:
    #   Efficent perl command to copy $start..$end to stdout
    my($file, $start, $end) = @_;
    my $len = $end - $start;
    return "<". shell_quote_scalar($file) .
 	q{ perl -e 'sysseek(STDIN,shift,0) || die; $left = shift; while($read = sysread(STDIN,$buf, ($left > 32768 ? 32768 : $left))){ $left -= $read; syswrite(STDOUT,$buf); }' } .
 	" ".$start." ".$len;
 }
 sub _dd_prefix_part_job {
    # Input:
    #  $file = the file to read
    #  $start = start byte
    #  $end = end byte
    # Returns:
    #  Efficent dd command to copy $start..$end to stdout
    my($file, $start, $end) = @_;
    # The optimal blocksize for mint, redhat, solaris, openbsd = 2^17..2^20
    # The optimal blocksize for freebsd = 2^15..2^17
    my $big_block = 131072;
    my $small_block = 512;
    # Copy:
    #   start .. 512*n: 1 byte at a time (1 MB/s)
    #   512*n .. 131072*n: 512 bytes at a time (300 MB/s)
    #   131072*n1 .. 131072*n2: 131072 bytes at a time (1 GB/s)
    #   131072*n .. 512*n: 512 bytes at a time (medium speed)
    #   512*n .. end: 1 byte at a time
    # start = 1234;
    # end = 4321;
    # len = end - start = 3087;
    my $len = $end - $start;
    # copy1_start = start;
    my $copy1_start = $start;
    # copy1_len = (10 - 1234) % 10 = (small_block - copy1_start) % small_block = 6;
    my $copy1_len = ($small_block - $copy1_start) % $small_block;
    # copy1_bs = 1;
    my $copy1_bs = 1;
    # copy1_count = 6 / 1  = copy1_len / copy1_bs = 6;
    my $copy1_count = $copy1_len / $copy1_bs;
    # copy1_skip = 1234 = start / copy1_bs;
    my $copy1_skip = $start / $copy1_bs;
    # copy2_start = start + copy1_len = 1240;
    my $copy2_start = $start + $copy1_len;
    # copy2_len = (100 - 1240) % 100 = (big_block - copy2_start) % big_block = 60;
    my $copy2_len = ($big_block - $copy2_start) % $big_block;
    # copy2_bs = small_block = 10;
    my $copy2_bs = $small_block;
    # copy2_count = 60 / 10  = copy2_len / copy2_bs = 6;
    my $copy2_count = $copy2_len / $copy2_bs;
    # copy2_skip = 1240 / 10 = copy2_start / copy2_bs = 124
    my $copy2_skip = $copy2_start / $copy2_bs;
    # copy5_len = 4321 % 10 = end % small_block = 1;
    my $copy5_len = $end % $small_block;
    # copy5_start = 4321 - 1 = end - copy5_len = 4320;
    my $copy5_start = $end - $copy5_len;
    # copy5_bs = 1;
    my $copy5_bs = 1;
    # copy5_count = 1 / 1  = copy5_len / copy5_bs = 1;
    my $copy5_count = $copy5_len / $copy5_bs;
    # copy5_skip = 4320 / 1 = copy5_start / copy5_bs = 4320
    my $copy5_skip = $copy5_start / $copy5_bs;
    # copy4_len = 4320 % 100 = copy5_start % big_block = 20;
    my $copy4_len = $copy5_start % $big_block;
    # copy4_start = end - copy5_len - copy4_len = 4300;
    my $copy4_start = $end - $copy5_len - $copy4_len;
    # copy4_bs = small_block = 10;
    my $copy4_bs = $small_block;
    # copy4_count = 20 / 10  = copy4_len / copy4_bs = 2;
    my $copy4_count = $copy4_len / $copy4_bs;
    # copy4_skip = 4300 / 10 = copy4_start / copy4_bs = 430
    my $copy4_skip = $copy4_start / $copy4_bs;
    # copy3_start = start + copy1_len + copy2_len = 1300;
    my $copy3_start = $start + $copy1_len + $copy2_len;
    # copy3_len = 4300 - 1300 = copy4_start - copy3_start = 3000;
    my $copy3_len = $copy4_start - $copy3_start;
    # copy3_bs = big_block = 100;
    my $copy3_bs = $big_block;
    # copy3_count = 3000 / 100  = copy3_len / copy3_bs = 3000;
    my $copy3_count = $copy3_len / $copy3_bs;
    # copy3_skip = 1300 / 100 = copy3_start / copy3_bs = 13
    my $copy3_skip = $copy3_start / $copy3_bs;
    return
 	"dd if=$file bs=$copy1_bs skip=$copy1_skip count=$copy1_count iflag=fullblock;" .
 	"dd if=$file bs=$copy2_bs skip=$copy2_skip count=$copy2_count iflag=fullblock;" .
 	"dd if=$file bs=$copy3_bs skip=$copy3_skip count=$copy3_count iflag=fullblock;" .
 	"dd if=$file bs=$copy4_bs skip=$copy4_skip count=$copy4_count iflag=fullblock;" .
 	"dd if=$file bs=$copy5_bs skip=$copy5_skip count=$copy5_count iflag=fullblock;"
 	;
 }
 sub spreadstdin {
    # read a record
    # Spawn a job and print the record to it.
@ -213,30 +391,7 @@ sub spreadstdin {
 	    }
 	}
    }
-    my ($recstart,$recend);
+    my ($recstart,$recend) = recstartrecend();
    if(defined($opt::recstart) and defined($opt::recend)) {
 	# If both --recstart and --recend is given then both must match
 	$recstart = $opt::recstart;
 	$recend = $opt::recend;
    } elsif(defined($opt::recstart)) {
 	# If --recstart is given it must match start of record
 	$recstart = $opt::recstart;
 	$recend = "";
    } elsif(defined($opt::recend)) {
 	# If --recend is given then it must match end of record
 	$recstart = "";
 	$recend = $opt::recend;
    }
    if($opt::regexp) {
 	# If $recstart/$recend contains '|' this should only apply to the regexp
 	$recstart = "(?:".$recstart.")";
 	$recend = "(?:".$recend.")";
    } else {
 	# $recstart/$recend = printf strings (\n)
 	$recstart =~ s/\\([rnt\'\"\\])/"qq|\\$1|"/gee;
 	$recend =~ s/\\([rnt\'\"\\])/"qq|\\$1|"/gee;
    }
    my $recendrecstart = $recend.$recstart;
    my $chunk_number = 1;
    my $one_time_through;
@ -355,6 +510,39 @@ sub spreadstdin {
    }
 }
 sub recstartrecend {
    # Uses:
    #   $opt::recstart
    #   $opt::recend
    # Returns:
    #   $recstart,$recend with default values and regexp conversion
    my($recstart,$recend);
    if(defined($opt::recstart) and defined($opt::recend)) {
 	# If both --recstart and --recend is given then both must match
 	$recstart = $opt::recstart;
 	$recend = $opt::recend;
    } elsif(defined($opt::recstart)) {
 	# If --recstart is given it must match start of record
 	$recstart = $opt::recstart;
 	$recend = "";
    } elsif(defined($opt::recend)) {
 	# If --recend is given then it must match end of record
 	$recstart = "";
 	$recend = $opt::recend;
    }
    if($opt::regexp) {
 	# If $recstart/$recend contains '|' this should only apply to the regexp
 	$recstart = "(?:".$recstart.")";
 	$recend = "(?:".$recend.")";
    } else {
 	# $recstart/$recend = printf strings (\n)
 	$recstart =~ s/\\([rnt\'\"\\])/"qq|\\$1|"/gee;
 	$recend =~ s/\\([rnt\'\"\\])/"qq|\\$1|"/gee;
    }
    return ($recstart,$recend);
 }
 sub nindex {
    # See if string is in buffer N times
    # Returns:
@ -591,6 +779,7 @@ sub options_hash {
 	 "header=s" => \$opt::header,
 	 "cat" => \$opt::cat,
 	 "fifo" => \$opt::fifo,
 	 "pipepart" => \$opt::pipepart,
 	);
 }
@ -630,7 +819,7 @@ sub get_options_from_array {
 sub parse_options {
    # Returns: N/A
    # Defaults:
-    $Global::version = 20140401;
+    $Global::version = 20140409;
    $Global::progname = 'parallel';
    $Global::infinity = 2**31;
    $Global::debug = 0;
@ -3842,7 +4031,6 @@ sub sshcommand_of_sshlogin {
                    # Run a sleep that outputs data, so it will discover if the ssh connection closes.
                    my $sleep = ::shell_quote_scalar('$|=1;while(1){sleep 1;print "foo\n"}');
                    my @master = ("ssh", "-tt", "-MTS", $control_path, $serverlogin, "perl", "-e", $sleep);
                    ::debug("@master\n");
                    exec(@master);
                }
            }
@ -5206,9 +5394,11 @@ sub new {
 	    $command = $Global::replace{'{}'};
 	} elsif($opt::pipe) {
 	    # With --pipe you can have ::: or not
-	    if(@opt::a) {
+#	    if(@opt::a) {
-		$command .=" ".$Global::replace{'{}'};
+#		$command .=" ".$Global::replace{'{}'};
-	    }
+#	    }
 	} elsif($opt::pipepart) {
 	    # With --pipe-part you can have nothing
        } else {
 	    # Add {} to the command if there are no {...}'s
 	    $command .=" ".$Global::replace{'{}'};
@ -5871,7 +6061,7 @@ sub get {
 	    );
 	$cmd_line->populate();
 	::debug("cmd_line->number_of_args ".$cmd_line->number_of_args()."\n");
-	if($opt::pipe) {
+	if($opt::pipe or $opt::pipepart) {
 	    if($cmd_line->replaced() eq "") {
 		# Empty command - pipe requires a command
 		::error("--pipe must have a command to pipe into (e.g. 'cat').\n");