From 35939753d6818a402f359faba756049a405508a7 Mon Sep 17 00:00:00 2001
From: Ole Tange
Date: Wed, 28 May 2014 23:45:13 +0200
Subject: [PATCH] parallel: --pipepart now works with --regexp

---
 doc/release_new_version |  4 +++
 src/parallel            | 56 +++++++++++++++++++++++++++--------------
 src/parallel.pod        |  7 ++++++
 3 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/doc/release_new_version b/doc/release_new_version
index 12375362..610cfbe0 100644
--- a/doc/release_new_version
+++ b/doc/release_new_version
@@ -228,6 +228,10 @@ Haiku of the month:
 
 New in this release:
 
+* GNU Parallel was cited in: Ferroelectric contributions to anomalous hysteresis in hybrid perovskite solar cells http://arxiv.org/pdf/1405.5810.pdf
+
+* Processes Paralleling to Speed up Computing and Tasks Execution in Linux http://kukuruku.co/hub/nix/processes-paralleling-to-speed-up-computing-and-tasks-execution-in-linux
+
 * Speeding up grep log queries with GNU Parallel http://www.tripwire.com/state-of-security/incident-detection/speeding-grep-queries-gnu-parallel/
 
 * Bug fixes and man page updates.
diff --git a/src/parallel b/src/parallel
index e7612f11..6b6d2f4a 100755
--- a/src/parallel
+++ b/src/parallel
@@ -211,14 +211,11 @@ if($opt::halt_on_error) {
 
 sub __PIPE_MODE__ {}
 
-# parallel --part-pipe -a bigfile cat
-# =>
-# (dd 1) | cat
-# (dd 2) | cat
-# (dd 3) | cat
-
-
 sub pipe_part_files {
+    # Input:
+    #   $file = the file to read
+    # Returns:
+    #   @commands to run to pipe the blocks of the file to the command given
     my ($file) = @_;
     # find positions
     my @pos = find_split_positions($file,$opt::blocksize);
@@ -258,13 +255,24 @@ sub find_split_positions {
         my $buf;
         seek($fh, $pos, 0) || die;
         while(read($fh,substr($buf,length $buf,0),$dd_block_size)) {
-            # If match $recend$recstart => Record position
-            my $i = index($buf,$recendrecstart);
-            if($i != -1) {
-                push(@pos,$pos+$i);
-                # Start looking for next record _after_ this match
-                $pos += $i;
-                last;
+            if($opt::regexp) {
+                # If match /$recend$recstart/ => Record position
+                if($buf =~ /(.*$recend)$recstart/os) {
+                    my $i = length($1);
+                    push(@pos,$pos+$i);
+                    # Start looking for next record _after_ this match
+                    $pos += $i;
+                    last;
+                }
+            } else {
+                # If match $recend$recstart => Record position
+                my $i = index($buf,$recendrecstart);
+                if($i != -1) {
+                    push(@pos,$pos+$i);
+                    # Start looking for next record _after_ this match
+                    $pos += $i;
+                    last;
+                }
             }
         }
     }
@@ -1303,6 +1311,10 @@ sub shell_quote {
 }
 
 sub shell_quote_empty {
+    # Inputs:
+    #   @strings = strings to be quoted
+    # Returns:
+    #   @quoted_strings = empty strings quoted as ''.
     my @strings = shell_quote(@_);
     for my $a (@strings) {
         if($a eq "") {
@@ -1344,7 +1356,6 @@ sub shell_quote_file {
     return $a;
 }
 
-
 sub maybe_quote {
     # If $Global::quoting is set then quote the string so shell will not expand any special chars
     # Else do not quote
@@ -1385,9 +1396,13 @@ sub shell_unquote {
         if(not defined $arg) {
             $arg = "";
         }
-        $arg =~ s/'\n'/\n/g; # filenames with '\n' is quoted using \'
+        # filenames with '\n' is quoted using \'\n\'
+        $arg =~ s/'\n'/\n/g;
+        # Non-printables
         $arg =~ s/\\([\002-\011\013-\032])/$1/g;
+        # Shell special chars
         $arg =~ s/\\([\#\?\`\(\)\{\}\*\>\<\~\|\; \"\!\$\&\'])/$1/g;
+        # Backslash
         $arg =~ s/\\\\/\\/g;
     }
     return wantarray ? @strings : "@strings";
@@ -1416,7 +1431,7 @@ sub save_stdin_stdout_stderr {
 }
 
 sub enough_file_handles {
-    # check that we have enough filehandles available for starting
+    # Check that we have enough filehandles available for starting
     # another job
     # Returns:
     #   1 if ungrouped (thus not needing extra filehandles)
@@ -1441,7 +1456,7 @@ sub enough_file_handles {
 }
 
 sub open_or_exit {
-    # Open a file name or exit if the fille cannot be opened
+    # Open a file name or exit if the file cannot be opened
     # Inputs:
    #   $file = filehandle or filename to open
     # Returns:
@@ -1497,8 +1512,10 @@ sub start_more_jobs {
         return $jobs_started;
     }
     if($Global::max_procs_file) {
+        # --jobs filename
         my $mtime = (stat($Global::max_procs_file))[9];
         if($mtime > $Global::max_procs_file_last_mod) {
+            # file changed: Force re-computing max_jobs_running
             $Global::max_procs_file_last_mod = $mtime;
             for my $sshlogin (values %Global::host) {
                 $sshlogin->set_max_jobs_running(undef);
@@ -1511,6 +1528,7 @@ sub start_more_jobs {
     # thus distribute the jobs on the --sshlogins round robin
     for my $sshlogin (values %Global::host) {
         if($Global::JobQueue->empty() and not $opt::pipe) {
+            # No more jobs in the queue
             last;
         }
         debug("Running jobs before on ".$sshlogin->string().": ".$sshlogin->jobs_running()."\n");
@@ -1528,7 +1546,7 @@
             next;
         }
         if($opt::delay and $opt::delay > ::now() - $Global::newest_starttime) {
-            # It has been too short since
+            # It has been too short since last start
             next;
         }
         debug($sshlogin->string()." has ".$sshlogin->jobs_running()
diff --git a/src/parallel.pod b/src/parallel.pod
index dcbd3ef3..ede3bf20 100644
--- a/src/parallel.pod
+++ b/src/parallel.pod
@@ -2583,6 +2583,13 @@ files are passed to the second B<parallel> that runs B<sort -m> on the
 files before it removes the files. The output is saved to
 B<bigfile.sort>.
 
+GNU B<parallel>'s B<--pipe> maxes out at around 100 MB/s because every
+byte has to be copied through GNU B<parallel>. But if B<bigfile> is a
+real (seekable) file GNU B<parallel> can by-pass the copying and send
+the parts directly to the program:
+
+B<parallel --pipepart --block 100m -a bigfile --files sort | parallel -Xj1 sort -m {} ';' rm {} E<gt>bigfile.sort>
+
 =head1 EXAMPLE: Running more than 500 jobs workaround
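
The hunk in find_split_positions above is the heart of the change: with --regexp the record separators are patterns, so a block boundary has to be located with a regexp match instead of index(). The following standalone Perl sketch (not code from GNU parallel; the record separators, the sample buffer and the printed labels are made up for illustration) shows the two ways of finding a boundary in a buffer that starts in the middle of a record:

    #!/usr/bin/perl
    # Minimal sketch of the two boundary searches: find where one record
    # ends ($recend) and the next one starts ($recstart) in a buffer.
    use strict;
    use warnings;

    # Hypothetical separators: records end with "END\n" and start with "ID:".
    my $recend   = "END\n";
    my $recstart = "ID:";
    my $buf      = "tail of a record cut by the block boundaryEND\nID: 42 next record";

    # Literal search (the pre-existing code path): index() on the two
    # separators concatenated; $i is the offset where $recend begins.
    my $i = index($buf, $recend . $recstart);
    print "index():  boundary at byte $i\n" if $i != -1;

    # Regexp search (the path enabled by --regexp): capture everything up
    # to and including a $recend that is immediately followed by
    # $recstart; length($1) is the offset just after $recend.
    if ($buf =~ /(.*$recend)$recstart/s) {
        print "regexp:   boundary after byte ", length($1), "\n";
    }

With boundaries found this way, an invocation along the lines of "parallel --pipepart --regexp --recend 'END\n' --recstart 'ID:' -a bigfile wc" (bigfile being a hypothetical seekable input file) should now split on regexp-defined record boundaries while still sending the parts directly to the command instead of copying every byte through GNU parallel.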