mirror of
https://git.savannah.gnu.org/git/parallel.git
synced 2024-11-22 05:57:54 +00:00
parallel: Note if \257 is used in any string that can contain replacement strings.
man pages: Lots of updates.
This commit is contained in:
parent
10b4539b1f
commit
1176f9a058
|
@ -204,8 +204,8 @@ GNU Parallel 20170622 ('Grenfell') <<[stable]>> has been released. It is availab
|
|||
|
||||
Haiku of the month:
|
||||
|
||||
<<>>
|
||||
-- Ole Tange
|
||||
I don't care
|
||||
I just need to get shit done
|
||||
|
||||
New in this release:
|
||||
|
||||
|
@ -218,6 +218,18 @@ New in this release:
|
|||
|
||||
* GNU Parallel was used in: https://libraries.io/rubygems/aai
|
||||
|
||||
* コマンドの並列化を行える『GNU parallel』の個人的使い方まとめ https://orebibou.com/2017/07/%E3%82%B3%E3%83%9E%E3%83%B3%E3%83%89%E3%81%AE%E4%B8%A6%E5%88%97%E5%8C%96%E3%82%92%E8%A1%8C%E3%81%88%E3%82%8B%E3%80%8Egnu-parallel%E3%80%8F%E3%81%AE%E5%80%8B%E4%BA%BA%E7%9A%84%E4%BD%BF%E3%81%84/
|
||||
|
||||
* https://blog.archive.org/2017/07/10/how-to-play-and-play-with-78rpm-record-transfers/
|
||||
|
||||
* https://gxnotes.com/article/130363.html
|
||||
|
||||
* https://sgillies.net/2017/05/18/rfc-8142-geojson-text-sequences.html
|
||||
|
||||
* https://lukas.zapletalovi.com/2017/07/git-auto-fetch-script-i-run-every-day.html
|
||||
|
||||
* http://crazyhottommy.blogspot.de/2017/07/cores-cpus-and-threads.html
|
||||
|
||||
<<Citation not OK: BAMClipper: removing primers from alignments to minimize false-negative mutations in amplicon next-generation sequencing https://www.nature.com/articles/s41598-017-01703-6>>
|
||||
|
||||
<<Wrong citation https://iris.sissa.it/retrieve/handle/20.500.11767/36149/10823/And%C3%B2_tesi.pdf>>
|
||||
|
|
|
@ -98,7 +98,7 @@ env_parallel() {
|
|||
}
|
||||
|
||||
|
||||
# Bash 'which' is broken in version 3.2.25 and 4.2.39
|
||||
# Bash is broken in version 3.2.25 and 4.2.39
|
||||
# The crazy '[ "`...`" == "" ]' is needed for the same reason
|
||||
if [ "`which parallel`" == "" ]; then
|
||||
echo 'env_parallel: Error: parallel must be in $PATH.' >&2
|
||||
|
@ -112,6 +112,8 @@ env_parallel() {
|
|||
_ignore_UNDERSCORE="`_get_ignored_VARS \"$@\"`"
|
||||
|
||||
# --record-env
|
||||
# Bash is broken in version 3.2.25 and 4.2.39
|
||||
# The crazy '[ "`...`" == 0 ]' is needed for the same reason
|
||||
if [ "`perl -e 'exit grep { /^--record-env$/ } @ARGV' -- "$@"; echo $?`" == 0 ] ; then
|
||||
true skip
|
||||
else
|
||||
|
|
|
@ -1376,7 +1376,7 @@ sub check_invalid_option_combinations {
|
|||
|
||||
sub init_globals {
|
||||
# Defaults:
|
||||
$Global::version = 20170706;
|
||||
$Global::version = 20170707;
|
||||
$Global::progname = 'parallel';
|
||||
$Global::infinity = 2**31;
|
||||
$Global::debug = 0;
|
||||
|
@ -9598,7 +9598,7 @@ sub new {
|
|||
# Skip if undefined
|
||||
$_ or next;
|
||||
# Escape \257 => \257\256
|
||||
$Global::escape_string_present = s/\257/\257\256/g;
|
||||
$Global::escape_string_present += s/\257/\257\256/g;
|
||||
# Needs to match rightmost left parens (Perl defaults to leftmost)
|
||||
# to deal with: {={==} and {={==}=}
|
||||
# Replace {= -> \257< and =} -> \257>
|
||||
|
|
|
@ -1238,7 +1238,7 @@ B<--pipepart> has a few limitations:
|
|||
|
||||
=over 3
|
||||
|
||||
=item Z<>*
|
||||
=item *
|
||||
|
||||
The file must be a normal file or a block device (technically it must
|
||||
be seekable) and must be given using B<-a> or B<::::>. The file cannot
|
||||
|
@ -1247,7 +1247,7 @@ be a pipe or a fifo as they are not seekable.
|
|||
If using a block device with a lot of NUL bytes, remember to set
|
||||
B<--recend ''>.
|
||||
|
||||
=item Z<>*
|
||||
=item *
|
||||
|
||||
Record counting (B<-N>) and line counting (B<-L>/B<-l>) do not work.
|
||||
|
||||
|
@ -4545,6 +4545,9 @@ you can make them by something like B<seq 1000000> > B<file> or B<yes
|
|||
If your example requires remote execution, see if you can use
|
||||
B<localhost> - maybe using another login.
|
||||
|
||||
If you have access to a different system, test if the MCVE shows the
|
||||
problem on that system.
|
||||
|
||||
=item *
|
||||
|
||||
The output of your example. If your problem is not easily reproduced
|
||||
|
|
|
@ -1080,9 +1080,10 @@ There are certain issues that are very common on parallelizing
|
|||
tools. Here are a few stress tests. Be warned: If the tool is badly
|
||||
coded it may overload your machine.
|
||||
|
||||
=head2 Output mixes
|
||||
=head2 A: Output mixes
|
||||
|
||||
Output from 2 jobs should not mix.
|
||||
Output from 2 jobs should not mix. If the tool does not buffer, output
|
||||
will most likely mix.
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
|
@ -1105,26 +1106,7 @@ Output from 2 jobs should not mix.
|
|||
# 'a b c' should always stay together
|
||||
# and there should only be a single line per job
|
||||
|
||||
=head2 Speed depends on number of words
|
||||
|
||||
Some tools become very slow if output lines have many words.
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
paralleltool=parallel
|
||||
|
||||
cat <<-EOF > mycommand
|
||||
#!/bin/bash
|
||||
|
||||
# 10 MB of lines with 1000 words
|
||||
yes "`seq 1000`" | head -c 10M
|
||||
EOF
|
||||
chmod +x mycommand
|
||||
|
||||
# Run 30 jobs in parallel
|
||||
seq 30 | time $paralleltool -j0 ./mycommand > /dev/null
|
||||
|
||||
=head2 Output limited by RAM
|
||||
=head2 B: Output limited by RAM
|
||||
|
||||
Some tools cache output in RAM. This makes them extremely slow if the
|
||||
output is bigger than physical memory and crash if the output is
|
||||
|
@ -1146,9 +1128,10 @@ bigger than the virtual memory.
|
|||
# Adjust 20 to be > physical RAM and < free space on /tmp
|
||||
seq 20 | time $paralleltool -j0 ./mycommand | wc -c
|
||||
|
||||
=head2 Leaving tmp files at unexpected death
|
||||
=head2 C: Leaving tmp files at unexpected death
|
||||
|
||||
Some tools do not clean up tmp files if they are killed.
|
||||
Some tools do not clean up tmp files if they are killed. If the tool
|
||||
buffers on disk, it may not clean up if it is killed.
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
|
@ -1164,7 +1147,7 @@ Some tools do not clean up tmp files if they are killed.
|
|||
# Should be empty: No files should be left behind
|
||||
diff <(ls /tmp) /tmp/before
|
||||
|
||||
=head2 Dealing badly with special file names.
|
||||
=head2 D: Dealing badly with special file names.
|
||||
|
||||
It is not uncommon for users to create files like:
|
||||
|
||||
|
@ -1179,18 +1162,37 @@ Some tools break on this.
|
|||
touch "My brother's 12\" records cost \$\$\$.txt"
|
||||
ls My*txt | $paralleltool echo
|
||||
|
||||
=head2 Composed commands do not work
|
||||
=head2 E: Composed commands do not work
|
||||
|
||||
Some tools require you to wrap composed commands into B<bash -c>.
|
||||
|
||||
echo bar | $paralleltool echo foo';' echo {}
|
||||
|
||||
=head2 Only one replacement string allowed
|
||||
=head2 F: Only one replacement string allowed
|
||||
|
||||
Some tools can only insert the argument once.
|
||||
|
||||
echo bar | $paralleltool echo {} foo {}
|
||||
|
||||
=head2 G: Speed depends on number of words
|
||||
|
||||
Some tools become very slow if output lines have many words.
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
paralleltool=parallel
|
||||
|
||||
cat <<-EOF > mycommand
|
||||
#!/bin/bash
|
||||
|
||||
# 10 MB of lines with 1000 words
|
||||
yes "`seq 1000`" | head -c 10M
|
||||
EOF
|
||||
chmod +x mycommand
|
||||
|
||||
# Run 30 jobs in parallel
|
||||
seq 30 | time $paralleltool -j0 ./mycommand > /dev/null
|
||||
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
|
|
|
@ -137,6 +137,17 @@ This is then all saved in B<$PARALLEL_ENV>.
|
|||
GNU B<parallel> is called, and B<$PARALLEL_ENV> is deleted.
|
||||
|
||||
|
||||
=head2 parset
|
||||
|
||||
B<parset> is a shell function. This is the reason why B<parset> can
|
||||
set variables: It runs in the shell which is calling it.
|
||||
|
||||
It is also the reason why B<parset> does not work, when data is piped
|
||||
into it: B<... | parset ...> makes B<parset> start in a subshell, and
|
||||
any changes in environment can therefore not make it back to the
|
||||
calling shell.
|
||||
|
||||
|
||||
=head2 Job slots
|
||||
|
||||
The easiest way to explain what GNU B<parallel> does is to assume that
|
||||
|
@ -179,13 +190,13 @@ outage.
|
|||
|
||||
GNU B<parallel> first selects a compression program. If the user has
|
||||
not selected one, the first of these that is in $PATH is used: B<pzstd
|
||||
lbzip2 pbzip2 zstd pigz lz4 lzop plzip lzip lrz gzip pxz lzma bzip2 xz
|
||||
clzip>. They are sorted by speed on a 32 core machine.
|
||||
lbzip2 pbzip2 zstd pixz lz4 pigz lzop plzip lzip gzip lrz pxz bzip2
|
||||
lzma xz clzip>. They are sorted by speed on a 128 core machine.
|
||||
|
||||
Schematically the setup is as follows:
|
||||
|
||||
command started by parallel | compress > tmpfile
|
||||
cattail tmpfile | uncompress | parallel
|
||||
cattail tmpfile | uncompress | parallel which reads the output
|
||||
|
||||
The setup is duplicated for both standard output (stdout) and standard
|
||||
error (stderr).
|
||||
|
@ -200,7 +211,7 @@ program is dead. If the compress program is dead, B<cattail> reads the
|
|||
rest of tmpfile and exits.
|
||||
|
||||
As most compression programs write out a header when they start, the
|
||||
tmpfile in practice is unlinked after around 40 ms.
|
||||
tmpfile in practice is removed by B<cattail> after around 40 ms.
|
||||
|
||||
|
||||
=head2 Wrapping
|
||||
|
@ -441,6 +452,10 @@ To run the commands in a B<tmux> session you basically just need to
|
|||
quote the command. For simple commands that is easy, but when commands
|
||||
contain special characters, it gets much harder to get right.
|
||||
|
||||
B<--compress> not only compresses standard output (stdout) but also
|
||||
standard error (stderr); and it does so into files, that are open but
|
||||
deleted, so a crash will not leave these files around.
|
||||
|
||||
B<--cat> and B<--fifo> are easy to do by hand, until you want to clean
|
||||
up the tmpfile and keep the exit code of the command.
|
||||
|
||||
|
@ -639,11 +654,13 @@ When run using B<exec>.
|
|||
When run as the last command using B<-c> from another shell (because
|
||||
some shells use B<exec>):
|
||||
|
||||
zsh% bash -c "parallel 'echo {} is not run in bash; set | grep BASH_VERSION' ::: This"
|
||||
zsh% bash -c "parallel 'echo {} is not run in bash; \
|
||||
set | grep BASH_VERSION' ::: This"
|
||||
|
||||
You can work around that by appending '&& true':
|
||||
|
||||
zsh% bash -c "parallel 'echo {} is run in bash; set | grep BASH_VERSION' ::: This && true"
|
||||
zsh% bash -c "parallel 'echo {} is run in bash; \
|
||||
set | grep BASH_VERSION' ::: This && true"
|
||||
|
||||
=item *
|
||||
|
||||
|
@ -865,6 +882,49 @@ size 8193 was chosen because 8192 gave wrong result on some file
|
|||
systems, whereas 8193 did the correct thing on all tested filesystems.
|
||||
|
||||
|
||||
=head2 Memory usage
|
||||
|
||||
Normally GNU B<parallel> will use around 17 MB RAM constantly - no
|
||||
matter how many jobs or how much output there is. There are a few
|
||||
things that cause the memory usage to rise:
|
||||
|
||||
=over 3
|
||||
|
||||
=item *
|
||||
|
||||
Multiple input sources. GNU B<parallel> reads an input source only
|
||||
once. This is by design, as an input source can be a stream
|
||||
(e.g. FIFO, pipe, standard input (stdin)) which cannot be rewound and
|
||||
read again. When reading a single input source, the memory is freed as
|
||||
soon as the job is done - thus keeping the memory usage constant.
|
||||
|
||||
But when reading multiple input sources GNU B<parallel> keeps the
|
||||
already read values for generating all combinations with other input
|
||||
sources.
|
||||
|
||||
=item *
|
||||
|
||||
Computing the number of jobs. B<--bar>, B<--eta>, and B<--halt xx%>
|
||||
use B<total_jobs()> to compute the total number of jobs. It does this
|
||||
by generating the data structures for all jobs. All these job data
|
||||
structures will be stored in memory and take up around 400 bytes/job.
|
||||
|
||||
=item *
|
||||
|
||||
Buffering a full line. B<--linebuffer> will read a full line per
|
||||
running job. A very long output line (say 1 GB without \n) will
|
||||
increase RAM usage temporarily: From when the beginning of the line is
|
||||
read till the line is printed.
|
||||
|
||||
=item *
|
||||
|
||||
Buffering the full output of a single job. This happens when using
|
||||
B<--results *.csv/*.tsv> or B<--sql*>. Here GNU B<parallel> will read
|
||||
the whole output of a single job and save it as csv/tsv or SQL.
|
||||
|
||||
=back
|
||||
|
||||
|
||||
=head2 Perl replacement strings, {= =}, and --rpl
|
||||
|
||||
The shorthands for replacement strings make a command look more
|
||||
|
@ -889,9 +949,8 @@ look like a matching pair. B<--parens> was made, so that the users can
|
|||
still use ,, and ,, if they like: B<--parens ,,,,>
|
||||
|
||||
Internally, however, the {= and =} are replaced by \257< and
|
||||
\257>. This is to make it simple to make regular expressions: \257 is
|
||||
disallowed on the command line, so when that is matched in a regular
|
||||
expression, it is known that this is a replacement string.
|
||||
\257>. This is to make it simpler to make regular expressions. You
|
||||
only need to look one character ahead, and never have to look behind.
|
||||
|
||||
|
||||
=head2 Test suite
|
||||
|
@ -1001,6 +1060,31 @@ reserved word in MySQL.
|
|||
The logo is inspired by the Cafe Wall illusion. The font is DejaVu
|
||||
Sans.
|
||||
|
||||
=head2 Citation notice
|
||||
|
||||
Funding a free software project is hard. GNU B<parallel> is no
|
||||
exception. On top of that it seems the less visible a project is, the
|
||||
harder it is to get funding. And the nature of GNU B<parallel> is that
|
||||
it will never be seen by "the guy with the checkbook", but only by the
|
||||
people doing the actual work.
|
||||
|
||||
This problem has been covered by others - though no solution has been
|
||||
found: https://www.slideshare.net/NadiaEghbal/consider-the-maintainer
|
||||
https://www.numfocus.org/blog/why-is-numpy-only-now-getting-funded/
|
||||
|
||||
Before implementing the citation notice it was discussed with the
|
||||
users:
|
||||
https://lists.gnu.org/archive/html/parallel/2013-11/msg00006.html
|
||||
|
||||
There is no doubt that this is not an ideal solution, but no one has
|
||||
so far come up with an ideal solution - neither for maintaining GNU
|
||||
B<parallel> nor other free software.
|
||||
|
||||
If you believe you have the perfect solution, you should try it out,
|
||||
and if it works, you should post it on the email list. Ideas that will
|
||||
cost work and which have not been tested are, however, unlikely to be
|
||||
prioritized.
|
||||
|
||||
|
||||
=head1 Ideas for new design
|
||||
|
||||
|
@ -1024,6 +1108,9 @@ Will that require 2x block size memory?
|
|||
|
||||
=head1 Historical decisions
|
||||
|
||||
These decisions were relevant for earlier versions of GNU B<parallel>,
|
||||
but not the current version. They are kept here as historical record.
|
||||
|
||||
=head2 --tollef
|
||||
|
||||
You can read about the history of GNU B<parallel> on
|
||||
|
|
|
@ -1552,6 +1552,17 @@ Output:
|
|||
|
||||
=head2 Termination
|
||||
|
||||
=head3 Unconditional termination
|
||||
|
||||
By default GNU B<parallel> will wait for all jobs to finish before exiting.
|
||||
|
||||
If you send GNU B<parallel> the B<TERM> signal, GNU B<parallel> will
|
||||
stop spawning new jobs and wait for the remaining jobs to finish. If
|
||||
you send GNU B<parallel> the B<TERM> signal again, GNU B<parallel>
|
||||
will kill all running jobs and exit.
|
||||
|
||||
=head3 Termination dependent on job status
|
||||
|
||||
For certain jobs there is no need to continue if one of the jobs fails
|
||||
and has an exit code different from 0. GNU B<parallel> will stop spawning new jobs
|
||||
with B<--halt soon,fail=1>:
|
||||
|
@ -2147,6 +2158,49 @@ Output will be similar to:
|
|||
./.parallel/tmp/aspire-1928520-1\;\);ssh server -- rm -rf
|
||||
.parallel/tmp/aspire-1928520-1; exit $_EXIT_status;
|
||||
|
||||
=head1 Saving output to shell variables (advanced)
|
||||
|
||||
GNU B<parset> will set shell variables to the output of GNU
|
||||
B<parallel>. GNU B<parset> has one important limitation: It cannot be
|
||||
part of a pipe. In particular this means it cannot read anything from
|
||||
standard input (stdin) or pipe output to another program.
|
||||
|
||||
To use GNU B<parset>, prepend the command with the destination variables:
|
||||
|
||||
parset myvar1,myvar2 echo ::: a b
|
||||
echo $myvar1
|
||||
echo $myvar2
|
||||
|
||||
Output:
|
||||
|
||||
a
|
||||
b
|
||||
|
||||
If you only give a single variable, it will be treated as an array:
|
||||
|
||||
parset myarray seq {} 5 ::: 1 2 3
|
||||
echo "${myarray[1]}"
|
||||
|
||||
Output:
|
||||
|
||||
2
|
||||
3
|
||||
4
|
||||
5
|
||||
|
||||
The commands to run can be an array:
|
||||
|
||||
cmd=("echo '<<joe \"double space\" cartoon>>'" "pwd")
|
||||
parset data ::: "${cmd[@]}"
|
||||
echo "${data[0]}"
|
||||
echo "${data[1]}"
|
||||
|
||||
Output:
|
||||
|
||||
<<joe "double space" cartoon>>
|
||||
[current dir]
|
||||
|
||||
|
||||
=head1 Saving to an SQL base (advanced)
|
||||
|
||||
GNU B<parallel> can save into an SQL base. Point GNU B<parallel> to a
|
||||
|
@ -3038,9 +3092,9 @@ Give a demo at your local user group/your team/your colleagues
|
|||
|
||||
=item *
|
||||
|
||||
Post the intro videos and the tutorial on Reddit, Diaspora*,
|
||||
forums, blogs, Identi.ca, Google+, Twitter, Facebook, Linkedin,
|
||||
and mailing lists
|
||||
Post the intro videos and the tutorial on Reddit, Mastodon, Diaspora*,
|
||||
forums, blogs, Identi.ca, Google+, Twitter, Facebook, Linkedin, and
|
||||
mailing lists
|
||||
|
||||
=item *
|
||||
|
||||
|
|
|
@ -56,21 +56,34 @@ Put output into vars B<$seq, $pwd, $ls>:
|
|||
into_vars=(seq pwd ls)
|
||||
parset "${into_vars[*]}" ::: "seq 10" pwd ls
|
||||
echo "$ls"
|
||||
|
||||
|
||||
The commands to run can be an array:
|
||||
|
||||
cmd=("echo '<<joe \"double space\" cartoon>>'" "pwd")
|
||||
parset data ::: "${cmd[@]}"
|
||||
echo "${data[0]}"
|
||||
echo "${data[1]}"
|
||||
|
||||
You cannot pipe into B<parset>, but must use a tempfile:
|
||||
|
||||
B<parset> cannot be part of a pipe. In particular this means it
|
||||
cannot read anything from standard input (stdin) or write to a pipe:
|
||||
|
||||
seq 10 | parset res echo Does not work
|
||||
|
||||
but must instead use a tempfile:
|
||||
|
||||
seq 10 > parallel_input
|
||||
parset res echo :::: parallel_input
|
||||
echo "${res[0]}"
|
||||
echo "${res[9]}"
|
||||
|
||||
or a FIFO:
|
||||
|
||||
mkfifo input_fifo
|
||||
seq 30 > input_fifo &
|
||||
parset res echo :::: input_fifo
|
||||
echo "${res[0]}"
|
||||
echo "${res[29]}"
|
||||
|
||||
or Bash process substitution:
|
||||
|
||||
parset res echo :::: <(seq 100)
|
||||
|
|
Loading…
Reference in a new issue