From bbd336643cf14301dcf5e5eceabc0579dd9d821e Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Sat, 29 Jul 2017 23:49:00 +0200 Subject: [PATCH] parallel: Small bug in {#prefix} replacement string. --- doc/release_new_version | 27 +--- src/parallel | 11 +- src/parallel_alternatives.pod | 291 ++++++++++++++++++++++++++++++---- src/parallel_design.pod | 2 +- 4 files changed, 276 insertions(+), 55 deletions(-) diff --git a/doc/release_new_version b/doc/release_new_version index 597f538d..e8e97c07 100644 --- a/doc/release_new_version +++ b/doc/release_new_version @@ -196,17 +196,16 @@ file:///home/tange/privat/parallel/doc/release_new_version from:tange@gnu.org to:parallel@gnu.org, bug-parallel@gnu.org -Subject: GNU Parallel 20170622 ('Grenfell') released <<[stable]>> +Subject: GNU Parallel 20170822 ('<<>>') released <<[stable]>> -GNU Parallel 20170622 ('Grenfell') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/ +GNU Parallel 20170822 ('<<>>') <<[stable]>> has been released. 
It is available for download at: http://ftpmirror.gnu.org/parallel/ <> -Quote of the month: +Haiku of the month: - I don't care - I just need to get shit done - -- Sab + <<>> + New in this release: @@ -215,21 +214,9 @@ New in this release: http://meta.askubuntu.com/a/16750/22307 http://meta.serverfault.com/a/9040/45704 -* GNU Parallel was cited in: Hayabusa: Simple and Fast Full-Text Search Engine for Massive System Log Data http://dl.acm.org/citation.cfm?id=3095788 +* GNU Parallel was cited in: -* コマンドの並列化を行える『GNU parallel』の個人的使い方まとめhttps://orebibou.com/2017/07/%E3%82%B3%E3%83%9E%E3%83%B3%E3%83%89%E3%81%AE%E4%B8%A6%E5%88%97%E5%8C%96%E3%82%92%E8%A1%8C%E3%81%88%E3%82%8B%E3%80%8Egnu-parallel%E3%80%8F%E3%81%AE%E5%80%8B%E4%BA%BA%E7%9A%84%E4%BD%BF%E3%81%84/ - -* https://blog.archive.org/2017/07/10/how-to-play-and-play-with-78rpm-record-transfers/ - -* https://gxnotes.com/article/130363.html - -* https://sgillies.net/2017/05/18/rfc-8142-geojson-text-sequences.html - -* https://lukas.zapletalovi.com/2017/07/git-auto-fetch-script-i-run-every-day.html - -* http://crazyhottommy.blogspot.de/2017/07/cores-cpus-and-threads.html - -' https://lukas.zapletalovi.com/2017/07/git-auto-fetch-script-i-run-every-day.html +* https://medium.com/@nornagon/today-i-learned-gnu-parallel-plate-tectonics-9fcf24045e63 <> diff --git a/src/parallel b/src/parallel index 2bd86e82..b7d2384b 100755 --- a/src/parallel +++ b/src/parallel @@ -1042,6 +1042,7 @@ sub options_hash { "internal-pipe-means-argfiles" => \$opt::internal_pipe_means_argfiles, "Y" => \$opt::retired, "skip-first-line" => \$opt::skip_first_line, + "bug" => \$opt::bug, "header=s" => \$opt::header, "cat" => \$opt::cat, "fifo" => \$opt::fifo, @@ -1102,6 +1103,7 @@ sub parse_options { if($opt::nokeeporder) { $opt::keeporder = undef; } if(@opt::v) { $Global::verbose = $#opt::v+1; } # Convert -v -v to v=2 + if($opt::bug) { ::die_bug("test-bug"); } $Global::debug = $opt::D; $Global::shell = $ENV{'PARALLEL_SHELL'} || parent_shell($$) || 
$ENV{'SHELL'} || "/bin/sh"; @@ -1419,7 +1421,7 @@ sub init_globals { # Bash ${a:2:3} '{:(\d+?):(\d+?)}' => '$_ = substr($_,$$1,$$2);', # Bash ${a#bc} - '{#([^#][^}]*?)}' => 's/^$$1//;', + '{#([^#}][^}]*?)}' => 's/^$$1//;', # Bash ${a%def} '{%([^}]+?)}' => 's/$$1$//;', # Bash ${a/def/ghi} ${a/def/} @@ -3674,7 +3676,7 @@ sub onall { ((defined $opt::timeout) ? "--timeout ".$opt::timeout : ""), ((defined $opt::ungroup) ? "-u" : ""), ((defined $opt::tee) ? "--tee" : ""), - ((defined $opt::workdir) ? "--wd ".$opt::workdir : ""), + ((defined $opt::workdir) ? "--wd ".::shell_quote_scalar($opt::workdir) : ""), (@Global::transfer_files ? map { "--tf ".::shell_quote_scalar($_) } @Global::transfer_files : ""), (@Global::ret_files ? map { "--return ".::shell_quote_scalar($_) } @@ -4089,7 +4091,10 @@ sub die_bug { my $bugid = shift; print STDERR ("$Global::progname: This should not happen. You have found a bug.\n", - "Please contact and include:\n", + "Please contact and follow\n", + "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n", + "\n", + "Include this in the report:\n", "* The version number: $Global::version\n", "* The bugid: $bugid\n", "* The command line being run\n", diff --git a/src/parallel_alternatives.pod b/src/parallel_alternatives.pod index 4a4da73e..893e85ed 100644 --- a/src/parallel_alternatives.pod +++ b/src/parallel_alternatives.pod @@ -866,71 +866,292 @@ opposite GNU B B does not clean up, if the process dies abnormally. B has some string manipulations that can be emulated by putting -this into ~/.parallel/config (% is used instead of ^): +this into ~/.parallel/config (/ is used instead of %, and % is used +instead of ^ as that is closer to bash's ${var%postfix}): --rpl '{:} s:(\.[^/]+)*$::' --rpl '{:%([^}]+?)} s:$$1(\.[^/]+)*$::' --rpl '{/:%([^}]*?)} s:.*/(.*)$$1(\.[^/]+)*$:$1:' --rpl '{/:} s:(.*/)?([^/.]+)(\.[^/]+)*$:$2:' + --rpl '{@(.*?)} /$$1/ and $_=$1;' + +Here are the examples from B's website with the equivalent +command in GNU B. 
+ +=head3 1. Simple run, quoting is not necessary + + $ seq 1 3 | rush echo {} + + $ seq 1 3 | parallel echo {} + +=head3 2. Read data from file (`-i`) + + $ rush echo {} -i data1.txt -i data2.txt + + $ cat data1.txt data2.txt | parallel echo {} + +=head3 3. Keep output order (`-k`) + + $ seq 1 3 | rush 'echo {}' -k + + $ seq 1 3 | parallel -k echo {} -Here are the examples from B's website: +=head3 4. Timeout (`-t`) -B<1> seq 1 10 | rush echo {} + $ time seq 1 | rush 'sleep 2; echo {}' -t 1 -B<1> seq 1 10 | parallel echo {} + $ time seq 1 | parallel --timeout 1 'sleep 2; echo {}' -B<2> seq 1 10 | rush 'echo {}' -k +=head3 5. Retry (`-r`) -B<2> seq 1 10 | parallel -k 'echo {}' + $ seq 1 | rush 'python unexisted_script.py' -r 1 -B<3> seq 1 | rush 'sleep 2; echo {}' -t 1 + $ seq 1 | parallel --retries 2 'python unexisted_script.py' -B<3> seq 1 | parallel --timeout 1 'sleep 2; echo {}' +Use B<-u> to see it is really run twice: -B<4> seq 1 | rush 'python script.py' -r 3 + $ seq 1 | parallel -u --retries 2 'python unexisted_script.py' -B<4> seq 1 | parallel --retries 4 'python script.py' +=head3 6. Dirname (`{/}`) and basename (`{%}`) and remove custom +suffix (`{^suffix}`) -B<5> echo dir/file_1.txt.gz | rush 'echo {/} {%} {^_1.txt.gz}' + $ echo dir/file_1.txt.gz | rush 'echo {/} {%} {^_1.txt.gz}' -B<5> echo dir/file_1.txt.gz | parallel --plus 'echo {//} {/} {%_1.txt.gz}' + $ echo dir/file_1.txt.gz | + parallel --plus echo {//} {/} {%_1.txt.gz} -B<6> echo dir.d/file.txt.gz | rush 'echo {.} {:} {%.} {%:}' +=head3 7. Get basename, and remove last (`{.}`) or any (`{:}`) extension -B<6> echo dir.d/file.txt.gz | parallel 'echo {.} {:} {/.} {/:}' + $ echo dir.d/file.txt.gz | rush 'echo {.} {:} {%.} {%:}' -B<7> echo 12 file.txt dir/s_1.fq.gz | rush 'echo job {#}: {2} {2.} {3%:^_1}' + $ echo dir.d/file.txt.gz | parallel 'echo {.} {:} {/.} {/:}' -B<7> echo 12 file.txt dir/s_1.fq.gz | parallel --colsep ' ' 'echo job {#}: {2} {2.} {3/:%_1}' +=head3 8. 
Job ID, combine fields index and other replacement strings -B<8> echo a=b=c | rush 'echo {1} {2} {3}' -d = + $ echo 12 file.txt dir/s_1.fq.gz | + rush 'echo job {#}: {2} {2.} {3%:^_1}' -B<8> echo a=b=c | parallel --colsep = 'echo {1} {2} {3}' + $ echo 12 file.txt dir/s_1.fq.gz | + parallel --colsep ' ' 'echo job {#}: {2} {2.} {3/:%_1}' -B<9> echo a=b=c | rush -D "=" -k 'echo {}' +=head3 9. Capture submatch using regular expression (`{@regexp}`) -B<9> echo -n a=b=c | parallel -d "=" -k 'echo {}' + $ echo read_1.fq.gz | rush 'echo {@(.+)_\d}' -B<9a> echo abc | rush -D "" -k 'echo {}' + $ echo read_1.fq.gz | parallel 'echo {@(.+)_\d}' -B<9a> echo -n abc | parallel --pipe --recend '' --block 1 -k parallel echo +=head3 10. Custom field delimiter (`-d`) -B<10> seq 1 | rush 'echo Hello, {fname} {lname}!' -v fname=Wei -v lname=Shen + $ echo a=b=c | rush 'echo {1} {2} {3}' -d = -B<10> seq 1 | parallel -N0 'fname=Wei; lname=Shen; echo Hello, ${fname} ${lname}!' + $ echo a=b=c | parallel -d = echo {1} {2} {3} -B<11> echo read_1.fq.gz | rush -v p={:^_1} 'echo {p} {p}_2.fq.gz' +=head3 11. Send multi-lines to every command (`-n`) -B<11> echo read_1.fq.gz | parallel 'p={:%_1}; echo ${p} ${p}_2.fq.gz' + $ seq 5 | rush -n 2 -k 'echo "{}"; echo' -B<12> seq 1 3 | rush 'sleep {}; echo {}' -c -t 2 + $ seq 5 | + parallel -n 2 -k \ + 'echo {=-1 $_=join"\n",@arg[1..$#arg] =}; echo' -B<12> seq 1 3 | parallel --joblog mylog --timeout 2 'sleep {}; echo {}' + $ seq 5 | rush -n 2 -k 'echo "{}"; echo' -J ' ' -B<12> Followed by: + $ seq 5 | parallel -n 2 -k 'echo {}; echo' -B<12> seq 1 3 | parallel --joblog mylog --retry-failed 'sleep {}; echo {}' + +=head3 12. Custom record delimiter (`-D`), note that empty records are not used. 
+ + $ echo a b c d | rush -D " " -k 'echo {}' + + $ echo a b c d | parallel -d " " -k 'echo {}' + + $ echo abcd | rush -D "" -k 'echo {}' + + Cannot be done by GNU Parallel + + $ cat fasta.fa + >seq1 + tag + >seq2 + cat + gat + >seq3 + attac + a + cat + + $ cat fasta.fa | rush -D ">" \ + 'echo FASTA record {#}: name: {1} sequence: {2}' -k -d "\n" + # rush fails to join the multiline sequences + + $ cat fasta.fa | (read -n1 ignore_first_char; + parallel -d '>' --colsep '\n' echo FASTA record {#}: \ + name: {1} sequence: '{=2 $_=join"",@arg[2..$#arg]=}' + ) + +=head3 13. Assign value to variable, like `awk -v` (`-v`) + + $ seq 1 | + rush 'echo Hello, {fname} {lname}!' -v fname=Wei -v lname=Shen + + $ seq 1 | + parallel -N0 \ + 'fname=Wei; lname=Shen; echo Hello, ${fname} ${lname}!' + + $ for var in a b; do \ + $ seq 1 3 | rush -k -v var=$var 'echo var: {var}, data: {}'; \ + $ done + +In GNU B you would typically do: + + $ seq 1 3 | parallel -k echo var: {1}, data: {2} ::: a b :::: - + +If you I want the var: + + $ seq 1 3 | + parallel -k var={1} ';echo var: $var, data: {}' ::: a b :::: - + +If you I want the B-loop: + + $ for var in a b; do + > export var; + > seq 1 3 | parallel -k 'echo var: $var, data: {}'; + > done + +Contrary to B this also works if the value is complex like: + + My brother's 12" records + + +=head3 14. 
B (`-v`), avoid repeatedly writing verbose replacement strings + + # naive way + $ echo read_1.fq.gz | rush 'echo {:^_1} {:^_1}_2.fq.gz' + + $ echo read_1.fq.gz | parallel 'echo {:%_1} {:%_1}_2.fq.gz' + + # macro + removing suffix + $ echo read_1.fq.gz | + rush -v p='{:^_1}' 'echo {p} {p}_2.fq.gz' + + $ echo read_1.fq.gz | + parallel 'p={:%_1}; echo $p ${p}_2.fq.gz' + + # macro + regular expression + $ echo read_1.fq.gz | rush -v p='{@(.+?)_\d}' 'echo {p} {p}_2.fq.gz' + + $ echo read_1.fq.gz | parallel 'p={@(.+?)_\d}; echo $p ${p}_2.fq.gz' + +Contrary to B GNU B works with complex values: + + echo "My brother's 12\"read_1.fq.gz" | + parallel 'p={@(.+?)_\d}; echo $p ${p}_2.fq.gz' + +=head3 15. Interrupt jobs by `Ctrl-C`, rush will stop unfinished +commands and exit. + + $ seq 1 20 | rush 'sleep 1; echo {}' + ^C + + $ seq 1 20 | parallel 'sleep 1; echo {}' + ^C + +=head3 16. Continue/resume jobs (`-c`). When some jobs failed (by +execution failure, timeout, or cancelling by user with `Ctrl + C`), +please switch flag `-c/--continue` on and run again, so that `rush` +can save successful commands and ignore them in **NEXT** run. + + $ seq 1 3 | rush 'sleep {}; echo {}' -t 3 -c + $ cat successful_cmds.rush + $ seq 1 3 | rush 'sleep {}; echo {}' -t 3 -c + + $ seq 1 3 | parallel --joblog mylog --timeout 2 \ + 'sleep {}; echo {}' + $ cat mylog + $ seq 1 3 | parallel --joblog mylog --retry-failed \ + 'sleep {}; echo {}' + +Multi-line jobs: + + $ seq 1 3 | rush 'sleep {}; echo {}; \ + echo finish {}' -t 3 -c -C finished.rush + $ cat finished.rush + $ seq 1 3 | rush 'sleep {}; echo {}; \ + echo finish {}' -t 3 -c -C finished.rush + + $ seq 1 3 | + parallel --joblog mylog --timeout 2 'sleep {}; echo {}; \ + echo finish {}' + $ cat mylog + $ seq 1 3 | + parallel --joblog mylog --retry-failed 'sleep {}; echo {}; \ + echo finish {}' + +=head3 17. 
A comprehensive example: downloading 1K+ pages given by +three URL list files using `phantomjs save_page.js` (some page +contents are dynamicly generated by Javascript, so `wget` does not +work). Here I set max jobs number (`-j`) as `20`, each job has a max +running time (`-t`) of `60` seconds and `3` retry changes +(`-r`). Continue flag `-c` is also switched on, so we can continue +unfinished jobs. Luckily, it's accomplished in one run :) + + $ for f in $(seq 2014 2016); do \ + $ /bin/rm -rf $f; mkdir -p $f; \ + $ cat $f.html.txt | rush -v d=$f -d = \ + 'phantomjs save_page.js "{}" > {d}/{3}.html' \ + -j 20 -t 60 -r 3 -c; \ + $ done + +GNU B can append to an existing joblog with '+': + + $ rm mylog + $ for f in $(seq 2014 2016); do + /bin/rm -rf $f; mkdir -p $f; + cat $f.html.txt | + parallel -j20 --timeout 60 --retries 4 --joblog +mylog \ + --colsep = \ + phantomjs save_page.js {1}={2}={3} '>' $f/{3}.html + done + +=head3 18. A bioinformatics example: mapping with `bwa`, and +processing result with `samtools`: + + $ ref=ref/xxx.fa + $ threads=25 + $ ls -d raw.cluster.clean.mapping/* \ + | rush -v ref=$ref -v j=$threads -v p='{}/{%}' \ + 'bwa mem -t {j} -M -a {ref} {p}_1.fq.gz {p}_2.fq.gz > {p}.sam; \ + samtools view -bS {p}.sam > {p}.bam; \ + samtools sort -T {p}.tmp -@ {j} {p}.bam -o {p}.sorted.bam; \ + samtools index {p}.sorted.bam; \ + samtools flagstat {p}.sorted.bam > {p}.sorted.bam.flagstat; \ + /bin/rm {p}.bam {p}.sam;' \ + -j 2 --verbose -c -C mapping.rush + +GNU B would use a function: + + $ ref=ref/xxx.fa + $ export ref + $ thr=25 + $ export thr + $ bwa_sam() { + p="$1" + bam="$p".bam + sam="$p".sam + sortbam="$p".sorted.bam + bwa mem -t $thr -M -a $ref ${p}_1.fq.gz ${p}_2.fq.gz > "$sam" + samtools view -bS "$sam" > "$bam" + samtools sort -T ${p}.tmp -@ $thr "$bam" -o "$sortbam" + samtools index "$sortbam" + samtools flagstat "$sortbam" > "$sortbam".flagstat + /bin/rm "$bam" "$sam" + } + $ export -f bwa_sam + $ ls -d raw.cluster.clean.mapping/* | + 
parallel -j 2 --verbose --joblog mylog bwa_sam + +=head3 Other B features B has: @@ -978,6 +1199,13 @@ With GNU B this can be emulated by: parallel --plus echo '{%.bar.gz}' ::: foo.ext.bar.gz +=item {@regexp}, capture submatch using regular expression + +With GNU B this can be emulated by: + + parallel --rpl '{@(.*?)} /$$1/ and $_=$1;' \ + echo '{@\d_(.*).gz}' ::: 1_foo.gz + =item {%.}, {%:}, basename without extension With GNU B this can be emulated by: @@ -1015,7 +1243,8 @@ double space, ' and ": =item * Commands of multi-lines -To improve readibilty GNU B encourages not to use multi-line +While you I use multi-lined commands in GNU B, to +improve readability GNU B encourages not to use multi-line commands. In most cases it can be written as a function: seq 1 3 | parallel --timeout 2 --joblog my.log 'sleep {}; echo {}; \ diff --git a/src/parallel_design.pod b/src/parallel_design.pod index 09749a91..ae2c116f 100644 --- a/src/parallel_design.pod +++ b/src/parallel_design.pod @@ -1077,7 +1077,7 @@ users: https://lists.gnu.org/archive/html/parallel/2013-11/msg00006.html There is no doubt that this is not an ideal solution, but no one has -so far come up with an ideal solution - neither for maintaining GNU +so far come up with an ideal solution - neither for funding GNU B nor other free software. If you believe you have the perfect solution, you should try it out,