From bbd336643cf14301dcf5e5eceabc0579dd9d821e Mon Sep 17 00:00:00 2001
From: Ole Tange <ole@tange.dk>
Date: Sat, 29 Jul 2017 23:49:00 +0200
Subject: [PATCH] parallel: Small bug in {#prefix} replacement string.

---
 doc/release_new_version       |  27 +---
 src/parallel                  |  11 +-
 src/parallel_alternatives.pod | 291 ++++++++++++++++++++++++++++++----
 src/parallel_design.pod       |   2 +-
 4 files changed, 276 insertions(+), 55 deletions(-)
diff --git a/doc/release_new_version b/doc/release_new_version
index 597f538d..e8e97c07 100644
--- a/doc/release_new_version
+++ b/doc/release_new_version
@@ -196,17 +196,16 @@ file:///home/tange/privat/parallel/doc/release_new_version
 from:tange@gnu.org
 to:parallel@gnu.org, bug-parallel@gnu.org
 
-Subject: GNU Parallel 20170622 ('Grenfell') released <<[stable]>>
+Subject: GNU Parallel 20170822 ('<<>>') released <<[stable]>>
 
-GNU Parallel 20170622 ('Grenfell') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
+GNU Parallel 20170822 ('<<>>') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
 
 <<No new functionality was introduced so this is a good candidate for a stable release.>>
 
-Quote of the month:
+Haiku of the month:
 
-  I don't care
-  I just need to get shit done
-    -- Sab
+  <<>>
+  
 
 New in this release:
 
@@ -215,21 +214,9 @@ New in this release:
   http://meta.askubuntu.com/a/16750/22307
   http://meta.serverfault.com/a/9040/45704
 
-* GNU Parallel was cited in: Hayabusa: Simple and Fast Full-Text Search Engine for Massive System Log Data http://dl.acm.org/citation.cfm?id=3095788
+* GNU Parallel was cited in:
 
-* コマンドの並列化を行える『GNU parallel』の個人的使い方まとめhttps://orebibou.com/2017/07/%E3%82%B3%E3%83%9E%E3%83%B3%E3%83%89%E3%81%AE%E4%B8%A6%E5%88%97%E5%8C%96%E3%82%92%E8%A1%8C%E3%81%88%E3%82%8B%E3%80%8Egnu-parallel%E3%80%8F%E3%81%AE%E5%80%8B%E4%BA%BA%E7%9A%84%E4%BD%BF%E3%81%84/
-
-* https://blog.archive.org/2017/07/10/how-to-play-and-play-with-78rpm-record-transfers/
-
-* https://gxnotes.com/article/130363.html
-
-* https://sgillies.net/2017/05/18/rfc-8142-geojson-text-sequences.html
-
-* https://lukas.zapletalovi.com/2017/07/git-auto-fetch-script-i-run-every-day.html
-
-* http://crazyhottommy.blogspot.de/2017/07/cores-cpus-and-threads.html
-
-' https://lukas.zapletalovi.com/2017/07/git-auto-fetch-script-i-run-every-day.html
+* https://medium.com/@nornagon/today-i-learned-gnu-parallel-plate-tectonics-9fcf24045e63
 
 <<Citation not OK: BAMClipper: removing primers from alignments to minimize false-negative mutations in amplicon next-generation sequencing https://www.nature.com/articles/s41598-017-01703-6>>
 
diff --git a/src/parallel b/src/parallel
index 2bd86e82..b7d2384b 100755
--- a/src/parallel
+++ b/src/parallel
@@ -1042,6 +1042,7 @@ sub options_hash {
 	 "internal-pipe-means-argfiles" => \$opt::internal_pipe_means_argfiles,
 	 "Y" => \$opt::retired,
          "skip-first-line" => \$opt::skip_first_line,
+	 "bug" => \$opt::bug,
 	 "header=s" => \$opt::header,
 	 "cat" => \$opt::cat,
 	 "fifo" => \$opt::fifo,
@@ -1102,6 +1103,7 @@ sub parse_options {
     if($opt::nokeeporder) { $opt::keeporder = undef; }
 
     if(@opt::v) { $Global::verbose = $#opt::v+1; } # Convert -v -v to v=2
+    if($opt::bug) { ::die_bug("test-bug"); }
     $Global::debug = $opt::D;
     $Global::shell = $ENV{'PARALLEL_SHELL'} || parent_shell($$)
 	|| $ENV{'SHELL'} || "/bin/sh";
@@ -1419,7 +1421,7 @@ sub init_globals {
 	 # Bash ${a:2:3}
 	 '{:(\d+?):(\d+?)}' => '$_ = substr($_,$$1,$$2);',
 	 # Bash ${a#bc}
-	 '{#([^#][^}]*?)}' => 's/^$$1//;',
+	 '{#([^#}][^}]*?)}' => 's/^$$1//;',
 	 # Bash ${a%def}
 	 '{%([^}]+?)}' => 's/$$1$//;',
 	 # Bash ${a/def/ghi} ${a/def/}
@@ -3674,7 +3676,7 @@ sub onall {
 	     ((defined $opt::timeout) ? "--timeout ".$opt::timeout : ""),
 	     ((defined $opt::ungroup) ? "-u" : ""),
 	     ((defined $opt::tee) ? "--tee" : ""),
-	     ((defined $opt::workdir) ? "--wd ".$opt::workdir : ""),
+	     ((defined $opt::workdir) ? "--wd ".::shell_quote_scalar($opt::workdir) : ""),
 	     (@Global::transfer_files ? map { "--tf ".::shell_quote_scalar($_) }
 	      @Global::transfer_files : ""),
 	     (@Global::ret_files ? map { "--return ".::shell_quote_scalar($_) }
@@ -4089,7 +4091,10 @@ sub die_bug {
     my $bugid = shift;
     print STDERR
 	("$Global::progname: This should not happen. You have found a bug.\n",
-	 "Please contact <parallel\@gnu.org> and include:\n",
+	 "Please contact <parallel\@gnu.org> and follow\n",
+	 "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n",
+	 "\n",
+	 "Include this in the report:\n",
 	 "* The version number: $Global::version\n",
 	 "* The bugid: $bugid\n",
 	 "* The command line being run\n",
diff --git a/src/parallel_alternatives.pod b/src/parallel_alternatives.pod
index 4a4da73e..893e85ed 100644
--- a/src/parallel_alternatives.pod
+++ b/src/parallel_alternatives.pod
@@ -866,71 +866,292 @@ opposite GNU B<parallel> B<rush> does not clean up, if the process
 dies abnormally.
 
 B<rush> has some string manipulations that can be emulated by putting
-this into ~/.parallel/config (% is used instead of ^):
+this into ~/.parallel/config (/ is used instead of %, and % is used
+instead of ^ as that is closer to bash's ${var%postfix}):
 
   --rpl '{:} s:(\.[^/]+)*$::'
   --rpl '{:%([^}]+?)} s:$$1(\.[^/]+)*$::'
   --rpl '{/:%([^}]*?)} s:.*/(.*)$$1(\.[^/]+)*$:$1:'
   --rpl '{/:} s:(.*/)?([^/.]+)(\.[^/]+)*$:$2:'
+  --rpl '{@(.*?)} /$$1/ and $_=$1;'
+
+Here are the examples from B<rush>'s website with the equivalent
+command in GNU B<parallel>.
+
+=head3 1. Simple run, quoting is not necessary
+
+  $ seq 1 3 | rush echo {}
+
+  $ seq 1 3 | parallel echo {}
+
+=head3 2. Read data from file (`-i`)
+
+  $ rush echo {} -i data1.txt -i data2.txt
+
+  $ cat data1.txt data2.txt | parallel echo {}
+
+=head3 3. Keep output order (`-k`)
+
+  $ seq 1 3 | rush 'echo {}' -k
+
+  $ seq 1 3 | parallel -k echo {}
 
 
-Here are the examples from B<rush>'s website:
+=head3 4. Timeout (`-t`)
 
-B<1> seq 1 10 | rush echo {}
+  $ time seq 1 | rush 'sleep 2; echo {}' -t 1
 
-B<1> seq 1 10 | parallel echo {}
+  $ time seq 1 | parallel --timeout 1 'sleep 2; echo {}'
 
-B<2> seq 1 10 | rush 'echo {}' -k
+=head3 5. Retry (`-r`)
 
-B<2> seq 1 10 | parallel -k 'echo {}'
+  $ seq 1 | rush 'python unexisted_script.py' -r 1
 
-B<3> seq 1 | rush 'sleep 2; echo {}' -t 1
+  $ seq 1 | parallel --retries 2 'python unexisted_script.py'
 
-B<3> seq 1 | parallel --timeout 1 'sleep 2; echo {}'
+Use B<-u> to see it is really run twice:
 
-B<4> seq 1 | rush 'python script.py' -r 3
+  $ seq 1 | parallel -u --retries 2 'python unexisted_script.py'
 
-B<4> seq 1 | parallel --retries 4 'python script.py'
+=head3 6. Dirname (`{/}`) and basename (`{%}`) and remove custom
+suffix (`{^suffix}`)
 
-B<5> echo dir/file_1.txt.gz | rush 'echo {/} {%} {^_1.txt.gz}'
+  $ echo dir/file_1.txt.gz | rush 'echo {/} {%} {^_1.txt.gz}'
 
-B<5> echo dir/file_1.txt.gz | parallel --plus 'echo {//} {/} {%_1.txt.gz}'
+  $ echo dir/file_1.txt.gz |
+      parallel --plus echo {//} {/} {%_1.txt.gz}
 
-B<6> echo dir.d/file.txt.gz | rush 'echo {.} {:} {%.} {%:}'
+=head3 7. Get basename, and remove last (`{.}`) or any (`{:}`) extension
 
-B<6> echo dir.d/file.txt.gz | parallel 'echo {.} {:} {/.} {/:}'
+  $ echo dir.d/file.txt.gz | rush 'echo {.} {:} {%.} {%:}'
 
-B<7> echo 12 file.txt dir/s_1.fq.gz | rush 'echo job {#}: {2} {2.} {3%:^_1}'
+  $ echo dir.d/file.txt.gz | parallel 'echo {.} {:} {/.} {/:}'
 
-B<7> echo 12 file.txt dir/s_1.fq.gz | parallel --colsep ' ' 'echo job {#}: {2} {2.} {3/:%_1}'
+=head3 8. Job ID, combine fields index and other replacement strings
 
-B<8> echo a=b=c | rush 'echo {1} {2} {3}' -d =
+  $ echo 12 file.txt dir/s_1.fq.gz |
+      rush 'echo job {#}: {2} {2.} {3%:^_1}'
 
-B<8> echo a=b=c | parallel --colsep = 'echo {1} {2} {3}'
+  $ echo 12 file.txt dir/s_1.fq.gz |
+      parallel --colsep ' ' 'echo job {#}: {2} {2.} {3/:%_1}'
 
-B<9> echo a=b=c | rush -D "=" -k 'echo {}'
+=head3 9. Capture submatch using regular expression (`{@regexp}`)
 
-B<9> echo -n a=b=c | parallel -d "=" -k 'echo {}'
+  $ echo read_1.fq.gz | rush 'echo {@(.+)_\d}'
 
-B<9a> echo abc | rush -D "" -k 'echo {}'
+  $ echo read_1.fq.gz | parallel 'echo {@(.+)_\d}'
 
-B<9a> echo -n abc | parallel --pipe --recend '' --block 1 -k parallel echo
+=head3 10. Custom field delimiter (`-d`)
 
-B<10> seq 1 | rush 'echo Hello, {fname} {lname}!' -v fname=Wei -v lname=Shen
+  $ echo a=b=c | rush 'echo {1} {2} {3}' -d =
 
-B<10> seq 1 | parallel -N0 'fname=Wei; lname=Shen; echo Hello, ${fname} ${lname}!'
+  $ echo a=b=c | parallel --colsep = echo {1} {2} {3}
 
-B<11> echo read_1.fq.gz | rush -v p={:^_1} 'echo {p} {p}_2.fq.gz'
+=head3 11. Send multi-lines to every command (`-n`)
 
-B<11> echo read_1.fq.gz | parallel 'p={:%_1}; echo ${p} ${p}_2.fq.gz'
+  $ seq 5 | rush -n 2 -k 'echo "{}"; echo'
 
-B<12> seq 1 3 | rush 'sleep {}; echo {}' -c -t 2
+  $ seq 5 |
+      parallel -n 2 -k \
+        'echo {=-1 $_=join"\n",@arg[1..$#arg] =}; echo'
 
-B<12> seq 1 3 | parallel --joblog mylog --timeout 2 'sleep {}; echo {}'
+  $ seq 5 | rush -n 2 -k 'echo "{}"; echo' -J ' '
 
-B<12> Followed by:
+  $ seq 5 | parallel -n 2 -k 'echo {}; echo'
 
-B<12> seq 1 3 | parallel --joblog mylog --retry-failed 'sleep {}; echo {}'
+
+=head3 12. Custom record delimiter (`-D`), note that empty records are not used.
+
+  $ echo a b c d | rush -D " " -k 'echo {}'
+
+  $ echo a b c d | parallel -d " " -k 'echo {}'
+
+  $ echo abcd | rush -D "" -k 'echo {}'
+
+  Cannot be done by GNU Parallel
+
+  $ cat fasta.fa
+  >seq1
+  tag
+  >seq2
+  cat
+  gat
+  >seq3
+  attac
+  a
+  cat
+
+  $ cat fasta.fa | rush -D ">" \
+      'echo FASTA record {#}: name: {1} sequence: {2}' -k -d "\n"
+  # rush fails to join the multiline sequences
+
+  $ cat fasta.fa | (read -n1 ignore_first_char;
+      parallel -d '>' --colsep '\n' echo FASTA record {#}: \
+        name: {1} sequence: '{=2 $_=join"",@arg[2..$#arg]=}'
+    )
+
+=head3 13. Assign value to variable, like `awk -v` (`-v`)
+
+  $ seq 1 |
+      rush 'echo Hello, {fname} {lname}!' -v fname=Wei -v lname=Shen
+
+  $ seq 1 |
+      parallel -N0 \
+        'fname=Wei; lname=Shen; echo Hello, ${fname} ${lname}!'
+
+  $ for var in a b; do \
+  $   seq 1 3 | rush -k -v var=$var 'echo var: {var}, data: {}'; \
+  $ done
+
+In GNU B<parallel> you would typically do:
+
+  $ seq 1 3 | parallel -k echo var: {1}, data: {2} ::: a b :::: -
+
+If you I<really> want the var:
+
+  $ seq 1 3 |
+      parallel -k var={1} ';echo var: $var, data: {}' ::: a b :::: -
+
+If you I<really> want the B<for>-loop:
+
+  $ for var in a b; do
+  >   export var;
+  >   seq 1 3 | parallel -k 'echo var: $var, data: {}';
+  > done
+
+Contrary to B<rush> this also works if the value is complex like:
+
+  My brother's 12" records
+
+
+=head3 14. B<Preset variable> (`-v`), avoid repeatedly writing verbose replacement strings
+
+  # naive way
+  $ echo read_1.fq.gz | rush 'echo {:^_1} {:^_1}_2.fq.gz'
+
+  $ echo read_1.fq.gz | parallel 'echo {:%_1} {:%_1}_2.fq.gz'
+
+  # macro + removing suffix
+  $ echo read_1.fq.gz |
+      rush -v p='{:^_1}' 'echo {p} {p}_2.fq.gz'
+
+  $ echo read_1.fq.gz |
+      parallel 'p={:%_1}; echo $p ${p}_2.fq.gz'
+
+  # macro + regular expression
+  $ echo read_1.fq.gz | rush -v p='{@(.+?)_\d}' 'echo {p} {p}_2.fq.gz'
+
+  $ echo read_1.fq.gz | parallel 'p={@(.+?)_\d}; echo $p ${p}_2.fq.gz'
+
+Contrary to B<rush> GNU B<parallel> works with complex values:
+
+  echo "My brother's 12\"read_1.fq.gz" |
+    parallel 'p={@(.+?)_\d}; echo $p ${p}_2.fq.gz'
+
+=head3 15. Interrupt jobs by `Ctrl-C`, rush will stop unfinished
+commands and exit.
+
+  $ seq 1 20 | rush 'sleep 1; echo {}'
+  ^C
+
+  $ seq 1 20 | parallel 'sleep 1; echo {}'
+  ^C
+
+=head3 16. Continue/resume jobs (`-c`). When some jobs failed (by
+execution failure, timeout, or cancelling by user with `Ctrl + C`),
+please switch flag `-c/--continue` on and run again, so that `rush`
+can save successful commands and ignore them in the B<NEXT> run.
+
+  $ seq 1 3 | rush 'sleep {}; echo {}' -t 3 -c
+  $ cat successful_cmds.rush
+  $ seq 1 3 | rush 'sleep {}; echo {}' -t 3 -c
+
+  $ seq 1 3 | parallel --joblog mylog --timeout 2 \
+      'sleep {}; echo {}'
+  $ cat mylog
+  $ seq 1 3 | parallel --joblog mylog --retry-failed \
+      'sleep {}; echo {}'
+
+Multi-line jobs:
+
+  $ seq 1 3 | rush 'sleep {}; echo {}; \
+    echo finish {}' -t 3 -c -C finished.rush
+  $ cat finished.rush
+  $ seq 1 3 | rush 'sleep {}; echo {}; \
+    echo finish {}' -t 3 -c -C finished.rush
+
+  $ seq 1 3 |
+      parallel --joblog mylog --timeout 2 'sleep {}; echo {}; \
+    echo finish {}'
+  $ cat mylog
+  $ seq 1 3 |
+      parallel --joblog mylog --retry-failed 'sleep {}; echo {}; \
+        echo finish {}'
+
+=head3 17. A comprehensive example: downloading 1K+ pages given by
+three URL list files using `phantomjs save_page.js` (some page
+contents are dynamically generated by JavaScript, so `wget` does not
+work). Here I set max jobs number (`-j`) as `20`, each job has a max
+running time (`-t`) of `60` seconds and `3` retry chances
+(`-r`). Continue flag `-c` is also switched on, so we can continue
+unfinished jobs. Luckily, it's accomplished in one run :)
+
+  $ for f in $(seq 2014 2016); do \
+  $    /bin/rm -rf $f; mkdir -p $f; \
+  $    cat $f.html.txt | rush -v d=$f -d = \
+         'phantomjs save_page.js "{}" > {d}/{3}.html' \
+         -j 20 -t 60 -r 3 -c; \
+  $ done
+
+GNU B<parallel> can append to an existing joblog with '+':
+
+  $ rm mylog
+  $ for f in $(seq 2014 2016); do
+      /bin/rm -rf $f; mkdir -p $f;
+      cat $f.html.txt |
+        parallel -j20 --timeout 60 --retries 4 --joblog +mylog \
+          --colsep = \
+          phantomjs save_page.js {1}={2}={3} '>' $f/{3}.html
+    done
+
+=head3 18. A bioinformatics example: mapping with `bwa`, and
+processing result with `samtools`:
+
+  $ ref=ref/xxx.fa
+  $ threads=25
+  $ ls -d raw.cluster.clean.mapping/* \
+    | rush -v ref=$ref -v j=$threads -v p='{}/{%}' \
+        'bwa mem -t {j} -M -a {ref} {p}_1.fq.gz {p}_2.fq.gz > {p}.sam; \
+        samtools view -bS {p}.sam > {p}.bam; \
+        samtools sort -T {p}.tmp -@ {j} {p}.bam -o {p}.sorted.bam; \
+        samtools index {p}.sorted.bam; \
+        samtools flagstat {p}.sorted.bam > {p}.sorted.bam.flagstat; \
+        /bin/rm {p}.bam {p}.sam;' \
+        -j 2 --verbose -c -C mapping.rush
+
+GNU B<parallel> would use a function:
+
+  $ ref=ref/xxx.fa
+  $ export ref
+  $ thr=25
+  $ export thr
+  $ bwa_sam() {
+      p="$1"
+      bam="$p".bam
+      sam="$p".sam
+      sortbam="$p".sorted.bam
+      bwa mem -t $thr -M -a $ref ${p}_1.fq.gz ${p}_2.fq.gz > "$sam"
+      samtools view -bS "$sam" > "$bam"
+      samtools sort -T ${p}.tmp -@ $thr "$bam" -o "$sortbam"
+      samtools index "$sortbam"
+      samtools flagstat "$sortbam" > "$sortbam".flagstat
+      /bin/rm "$bam" "$sam"
+    }
+  $ export -f bwa_sam
+  $ ls -d raw.cluster.clean.mapping/* |
+      parallel -j 2 --verbose --joblog mylog bwa_sam
+
+=head3 Other B<rush> features
 
 B<rush> has:
 
@@ -978,6 +1199,13 @@ With GNU B<parallel> this can be emulated by:
 
   parallel --plus echo '{%.bar.gz}' ::: foo.ext.bar.gz
 
+=item {@regexp}, capture submatch using regular expression
+
+With GNU B<parallel> this can be emulated by:
+
+  parallel --rpl '{@(.*?)} /$$1/ and $_=$1;' \
+    echo '{@\d_(.*).gz}' ::: 1_foo.gz
+
 =item {%.}, {%:}, basename without extension
 
 With GNU B<parallel> this can be emulated by:
@@ -1015,7 +1243,8 @@ double space, ' and ":
 
 =item * Commands of multi-lines
 
-To improve readibilty GNU B<parallel> encourages not to use multi-line
+While you I<can> use multi-line commands in GNU B<parallel>, to
+improve readability GNU B<parallel> encourages not to use multi-line
 commands. In most cases it can be written as a function:
 
   seq 1 3 | parallel --timeout 2 --joblog my.log 'sleep {}; echo {}; \
diff --git a/src/parallel_design.pod b/src/parallel_design.pod
index 09749a91..ae2c116f 100644
--- a/src/parallel_design.pod
+++ b/src/parallel_design.pod
@@ -1077,7 +1077,7 @@ users:
 https://lists.gnu.org/archive/html/parallel/2013-11/msg00006.html
 
 There is no doubt that this is not an ideal solution, but no one has
-so far come up with an ideal solution - neither for maintaining GNU
+so far come up with an ideal solution - neither for funding GNU
 B<parallel> nor other free software.
 
 If you believe you have the perfect solution, you should try it out,