mirror of
https://git.savannah.gnu.org/git/parallel.git
synced 2024-11-22 05:57:54 +00:00
parallel: Fixed bug #56275: --shard with named column and perl expr.
This commit is contained in:
parent
17a5fd702d
commit
5d81c7c053
12
doc/haikus
12
doc/haikus
|
@ -1,5 +1,17 @@
|
|||
Quote of the month:
|
||||
|
||||
I love so much @GnuParallel to parallelize my SQL requests on @PostgreSQL
|
||||
-- @rmaziere_85 Romain
|
||||
|
||||
I want to make a shout-out for @GnuParallel, it's a work of beauty and power
|
||||
-- Cristian Consonni @CristianCantoro
|
||||
|
||||
Parallel is the BEST command.
|
||||
-- Nick @NickInfoSec
|
||||
|
||||
It is SUPER easy to speed up jobs from the command line w/ GNU parallel.
|
||||
-- B3n @B3njaminHimes@twitter
|
||||
|
||||
GNU parallel really changed how I do a lot of data processing stuff
|
||||
-- Brendan Dolan-Gavitt @moyix@twitter
|
||||
|
||||
|
|
17
doc/promo
17
doc/promo
|
@ -38,11 +38,20 @@ newest version.
|
|||
|
||||
Before GNU Parallel was a GNU tool, it started as a wrapper around
|
||||
`make -j`. But GNU Parallel grew, and was no longer just a small
|
||||
hack. To make the code easier to maintain it was rewritten to object
|
||||
orientation.
|
||||
hack.
|
||||
|
||||
This would not have been possible if the test suite had not been so
|
||||
thorough: It made it much easier to see if
|
||||
The design goals included not requiring a compiler, compatibility with
|
||||
old operating systems, and being a single-file program. This limited the
|
||||
languages tremendously.
|
||||
|
||||
Perl and Python were in practice the only possibilities. Python was at
|
||||
the time quite slow, resource hungry, and not as widely installed as
|
||||
Perl. So Perl was the choice.
|
||||
|
||||
To make the code easier to maintain it was rewritten to object
|
||||
orientation. This would not have been possible if the test suite had
|
||||
not been so thorough: It made it much easier to see if a code change
|
||||
caused a change in behaviour.
|
||||
|
||||
|
||||
=head2 --tollef
|
||||
|
|
|
@ -206,9 +206,9 @@ from:tange@gnu.org
|
|||
to:parallel@gnu.org, bug-parallel@gnu.org
|
||||
stable-bcc: Jesse Alama <jessealama@fastmail.fm>
|
||||
|
||||
Subject: GNU Parallel 20190522 ('Akihito') released <<[stable]>>
|
||||
Subject: GNU Parallel 20190622 ('Frederiksen/Saybie/HongKong') released <<[stable]>>
|
||||
|
||||
GNU Parallel 20190522 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
|
||||
GNU Parallel 20190622 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
|
||||
|
||||
<<No new functionality was introduced so this is a good candidate for a stable release.>>
|
||||
|
||||
|
@ -218,19 +218,16 @@ See https://www.gnu.org/software/parallel/10-years-anniversary.html
|
|||
|
||||
Quote of the month:
|
||||
|
||||
Amazingly useful script!
|
||||
-- unxusr@reddit.com
|
||||
|
||||
<<>>
|
||||
|
||||
New in this release:
|
||||
|
||||
* --group-by groups lines depending on value of a column. The value can be computed.
|
||||
https://livefreeordichotomize.com/2019/06/04/using_awk_and_r_to_parse_25tb/
|
||||
|
||||
* How to compress (bzip / gzip) a very large text quickly? https://medium.com/@gchandra/how-to-compress-bzip-gzip-a-very-large-text-quickly-27c11f4c6681
|
||||
https://zh.wikipedia.org/wiki/GNU_parallel
|
||||
chck NEWS http://parallelandvisualtestingwithbehat.blogspot.com/p/blog-page.html
|
||||
|
||||
* Simple tutorial to install & use GNU Parallel https://medium.com/@gchandra/simple-tutorial-to-install-use-gnu-parallel-79251120d618
|
||||
|
||||
* Introducing Parallel into Shell https://petelawson.com/post/parallel-in-shell/
|
||||
* GNU Parallel Akihito released https://linuxreviews.org/GNU_Parallel_Akihito_released
|
||||
|
||||
* Bug fixes and man page updates.
|
||||
|
||||
|
|
32
src/parallel
32
src/parallel
|
@ -378,6 +378,8 @@ sub sharder_script() {
|
|||
my $col = shift;
|
||||
# Which columns to shard on (count from 0)
|
||||
my $col0 = $col - 1;
|
||||
# Perl expression
|
||||
my $perlexpr = shift;
|
||||
my $bins = @ARGV;
|
||||
# Open fifos for writing, fh{0..$bins}
|
||||
my $t = 0;
|
||||
|
@ -388,12 +390,26 @@ sub sharder_script() {
|
|||
# so unlink only happens when it is ready
|
||||
unlink $_;
|
||||
}
|
||||
if($perlexpr) {
|
||||
my $subref = eval("sub { no strict; no warnings; $perlexpr }");
|
||||
while(<STDIN>) {
|
||||
# Split into $col columns (no need to split into more)
|
||||
@F = split $sep, $_, $col+1;
|
||||
{
|
||||
local $_ = $F[$col0];
|
||||
&$subref();
|
||||
$fh = $fh{ hex(B::hash($_))%$bins };
|
||||
}
|
||||
print $fh $_;
|
||||
}
|
||||
} else {
|
||||
while(<STDIN>) {
|
||||
# Split into $col columns (no need to split into more)
|
||||
@F = split $sep, $_, $col+1;
|
||||
$fh = $fh{ hex(B::hash($F[$col0]))%$bins };
|
||||
print $fh $_;
|
||||
}
|
||||
}
|
||||
# Close all open fifos
|
||||
close values %fh;
|
||||
};
|
||||
|
@ -423,6 +439,20 @@ sub pipe_shard_setup() {
|
|||
my $script = sharder_script();
|
||||
# cat foo | sharder sep col fifo1 fifo2 fifo3 ... fifoN
|
||||
|
||||
if($opt::shard =~ /^[a-z_][a-z_0-9]*(\s|$)/i) {
|
||||
# Group by column name
|
||||
# (Yes, this will also wrongly match a perlexpr like: chop)
|
||||
my($read,$char,@line);
|
||||
# A full line, but nothing more (the rest must be read by the child)
|
||||
# $Global::header used to prepend block to each job
|
||||
do {
|
||||
$read = sysread(STDIN,$char,1);
|
||||
push @line, $char;
|
||||
} while($read and $char ne "\n");
|
||||
$Global::header = join "", @line;
|
||||
}
|
||||
my ($col, $perlexpr, $subref) =
|
||||
column_perlexpr($opt::shard, $Global::header, $opt::colsep);
|
||||
if(not fork()) {
|
||||
# Let the sharder inherit our stdin
|
||||
# and redirect stdout to null
|
||||
|
@ -432,7 +462,7 @@ sub pipe_shard_setup() {
|
|||
$ENV{'PERL_HASH_SEED'} = $$;
|
||||
exec qw(parallel --block 100k -q --pipe -j), $njobs,
|
||||
qw(--roundrobin -u perl -e), $script, ($opt::colsep || ","),
|
||||
$opt::shard, '{}', (map { (':::+', @{$_}) } @parcatfifos);
|
||||
$col, $perlexpr, '{}', (map { (':::+', @{$_}) } @parcatfifos);
|
||||
}
|
||||
# For each fifo
|
||||
# (rm fifo1; grep 1) < fifo1
|
||||
|
|
|
@ -2155,13 +2155,17 @@ Only supported in B<Ash, Bash, Dash, Ksh, Sh, and Zsh>.
|
|||
See also B<--env>, B<--record-env>.
|
||||
|
||||
|
||||
=item B<--shard> I<shardcol> (beta testing)
|
||||
=item B<--shard> I<shardexpr> (alpha testing)
|
||||
|
||||
Use column I<shardcol> as shard key and shard input to the jobs.
|
||||
Use I<shardexpr> as shard key and shard input to the jobs.
|
||||
|
||||
Each input line is split using B<--colsep>. The value in the
|
||||
I<shardcol> column is hashed so that all lines of a given value is
|
||||
given to the same job slot.
|
||||
I<shardexpr> is [column number|column name] [perlexpression] e.g. 3,
|
||||
Address, 3 $_%=100, Address s/\d//g.
|
||||
|
||||
Each input line is split using B<--colsep>. The value of the column is
|
||||
put into $_, the perl expression is executed, the resulting value is
|
||||
hashed so that all lines of a given value are given to the same job
|
||||
slot.
|
||||
|
||||
This is similar to sharding in databases.
|
||||
|
||||
|
|
|
@ -68,8 +68,12 @@ The commands to run can be an array:
|
|||
echo "${data[1]}"
|
||||
echo "${data[2]}"
|
||||
|
||||
B<parset> can not be part of a pipe. In particular this means it
|
||||
cannot read anything from standard input (stdin) or write to a pipe:
|
||||
B<parset> can read from stdin (standard input) if it is a file:
|
||||
|
||||
parset res echo < parallel_input_file
|
||||
|
||||
but B<parset> can not be part of a pipe. In particular this means it
|
||||
cannot read from a pipe or write to a pipe:
|
||||
|
||||
seq 10 | parset res echo Does not work
|
||||
|
||||
|
@ -124,8 +128,12 @@ Put output into vars B<$seq, $pwd, $ls>:
|
|||
parset "seq pwd ls" ::: "seq 10" pwd ls
|
||||
echo "$ls"
|
||||
|
||||
B<parset> can not be part of a pipe. In particular this means it
|
||||
cannot read anything from standard input (stdin) or write to a pipe:
|
||||
B<parset> can read from stdin (standard input) if it is a file:
|
||||
|
||||
parset res1,res2,res3 echo < parallel_input_file
|
||||
|
||||
but B<parset> can not be part of a pipe. In particular this means it
|
||||
cannot read from a pipe or write to a pipe:
|
||||
|
||||
seq 3 | parset res1,res2,res3 echo Does not work
|
||||
|
||||
|
|
|
@ -4,6 +4,18 @@
|
|||
# Each should be taking 10-30s and be possible to run in parallel
|
||||
# I.e.: No race conditions, no logins
|
||||
|
||||
par_kill_hup() {
|
||||
echo '### Are children killed if GNU Parallel receives HUP? There should be no sleep at the end'
|
||||
|
||||
parallel -j 2 -q bash -c 'sleep {} & pid=$!; wait $pid' ::: 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 &
|
||||
T=$!
|
||||
sleep 9.9
|
||||
pstree $$
|
||||
kill -HUP $T
|
||||
sleep 2
|
||||
pstree $$
|
||||
}
|
||||
|
||||
par_parset() {
|
||||
echo '### test parset'
|
||||
. `which env_parallel.bash`
|
||||
|
|
|
@ -61,6 +61,37 @@ par_shard() {
|
|||
}
|
||||
shard_on_col 1
|
||||
shard_on_col 2
|
||||
|
||||
shard_on_col_name() {
|
||||
colname=$1
|
||||
col=$2
|
||||
(echo AB; seq 10 99 | shuf) | perl -pe 's/(.)/$1\t/g' |
|
||||
parallel --header : --pipe --shard $colname -j2 --colsep "\t" sort -k$col |
|
||||
field $col | uniq -c | sort
|
||||
}
|
||||
shard_on_col_name A 1
|
||||
shard_on_col_name B 2
|
||||
|
||||
shard_on_col_expr() {
|
||||
colexpr="$1"
|
||||
col=$2
|
||||
(seq 10 99 | shuf) | perl -pe 's/(.)/$1\t/g' |
|
||||
parallel --pipe --shard "$colexpr" -j2 --colsep "\t" "sort -k$col; echo c1 c2" |
|
||||
field $col | uniq -c | sort
|
||||
}
|
||||
shard_on_col_expr '1 $_%=3' 1
|
||||
shard_on_col_expr '2 $_%=3' 2
|
||||
|
||||
shard_on_col_name_expr() {
|
||||
colexpr="$1"
|
||||
col=$2
|
||||
(echo AB; seq 10 99 | shuf) | perl -pe 's/(.)/$1\t/g' |
|
||||
parallel --header : --pipe --shard "$colexpr" -j2 --colsep "\t" "sort -k$col; echo c1 c2" |
|
||||
field $col | uniq -c | sort
|
||||
}
|
||||
shard_on_col_name_expr 'A $_%=3' 1
|
||||
shard_on_col_name_expr 'B $_%=3' 2
|
||||
|
||||
echo '*** broken'
|
||||
# Shorthand for --pipe -j+0
|
||||
seq 100000 | parallel --shard 1 wc |
|
||||
|
@ -148,18 +179,6 @@ par_kill_term() {
|
|||
pstree $$
|
||||
}
|
||||
|
||||
par_kill_hup() {
|
||||
echo '### Are children killed if GNU Parallel receives HUP? There should be no sleep at the end'
|
||||
|
||||
parallel -j 2 -q bash -c 'sleep {} & pid=$!; wait $pid' ::: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 &
|
||||
T=$!
|
||||
sleep 2.9
|
||||
pstree $$
|
||||
kill -HUP $T
|
||||
sleep 2
|
||||
pstree $$
|
||||
}
|
||||
|
||||
par_kill_int_twice() {
|
||||
echo '### Are children killed if GNU Parallel receives INT twice? There should be no sleep at the end'
|
||||
|
||||
|
|
|
@ -258,6 +258,15 @@ par_kill_children_timeout parallel: Warning: This job was killed because it time
|
|||
par_kill_children_timeout parallel: Warning: doit 1000000001
|
||||
par_kill_children_timeout 2
|
||||
par_kill_children_timeout 0 0 0
|
||||
par_kill_hup ### Are children killed if GNU Parallel receives HUP? There should be no sleep at the end
|
||||
par_kill_hup bash-+-perl
|
||||
par_kill_hup `-pstree
|
||||
par_kill_hup parallel: SIGHUP received. No new jobs will be started.
|
||||
par_kill_hup parallel: Waiting for these 2 jobs to finish. Send SIGTERM to stop now.
|
||||
par_kill_hup parallel: bash -c 'sleep 3 & pid=$!; wait $pid'
|
||||
par_kill_hup parallel: bash -c 'sleep 3 & pid=$!; wait $pid'
|
||||
par_kill_hup bash-+-perl---2*[bash---sleep]
|
||||
par_kill_hup `-pstree
|
||||
par_line_buffer ### --line-buffer
|
||||
par_line_buffer 55 55 120
|
||||
par_line_buffer These must diff: 1
|
||||
|
|
|
@ -1122,14 +1122,6 @@ par_jobslot_repl 2
|
|||
par_jobslot_repl 1
|
||||
par_jobslot_repl 2
|
||||
par_jobslot_repl 1
|
||||
par_kill_hup ### Are children killed if GNU Parallel receives HUP? There should be no sleep at the end
|
||||
par_kill_hup bash-+-perl---2*[bash---sleep]
|
||||
par_kill_hup `-pstree
|
||||
par_kill_hup parallel: SIGHUP received. No new jobs will be started.
|
||||
par_kill_hup parallel: Waiting for these 2 jobs to finish. Send SIGTERM to stop now.
|
||||
par_kill_hup parallel: bash -c 'sleep 1 & pid=$!; wait $pid'
|
||||
par_kill_hup parallel: bash -c 'sleep 1 & pid=$!; wait $pid'
|
||||
par_kill_hup bash---pstree
|
||||
par_kill_int_twice ### Are children killed if GNU Parallel receives INT twice? There should be no sleep at the end
|
||||
par_kill_int_twice bash-+-perl---bash---sleep
|
||||
par_kill_int_twice `-pstree
|
||||
|
@ -1288,6 +1280,71 @@ par_shard 9 6
|
|||
par_shard 9 7
|
||||
par_shard 9 8
|
||||
par_shard 9 9
|
||||
par_shard 10 1
|
||||
par_shard 10 2
|
||||
par_shard 10 3
|
||||
par_shard 10 4
|
||||
par_shard 10 5
|
||||
par_shard 10 6
|
||||
par_shard 10 7
|
||||
par_shard 10 8
|
||||
par_shard 10 9
|
||||
par_shard 9 0
|
||||
par_shard 9 1
|
||||
par_shard 9 2
|
||||
par_shard 9 3
|
||||
par_shard 9 4
|
||||
par_shard 9 5
|
||||
par_shard 9 6
|
||||
par_shard 9 7
|
||||
par_shard 9 8
|
||||
par_shard 9 9
|
||||
par_shard 10 1
|
||||
par_shard 10 2
|
||||
par_shard 10 3
|
||||
par_shard 10 4
|
||||
par_shard 10 5
|
||||
par_shard 10 6
|
||||
par_shard 10 7
|
||||
par_shard 10 8
|
||||
par_shard 10 9
|
||||
par_shard 1 c1
|
||||
par_shard 1 c1
|
||||
par_shard 1 c2
|
||||
par_shard 1 c2
|
||||
par_shard 9 0
|
||||
par_shard 9 1
|
||||
par_shard 9 2
|
||||
par_shard 9 3
|
||||
par_shard 9 4
|
||||
par_shard 9 5
|
||||
par_shard 9 6
|
||||
par_shard 9 7
|
||||
par_shard 9 8
|
||||
par_shard 9 9
|
||||
par_shard 10 1
|
||||
par_shard 10 2
|
||||
par_shard 10 3
|
||||
par_shard 10 4
|
||||
par_shard 10 5
|
||||
par_shard 10 6
|
||||
par_shard 10 7
|
||||
par_shard 10 8
|
||||
par_shard 10 9
|
||||
par_shard 1 c1
|
||||
par_shard 1 c1
|
||||
par_shard 1 c2
|
||||
par_shard 1 c2
|
||||
par_shard 9 0
|
||||
par_shard 9 1
|
||||
par_shard 9 2
|
||||
par_shard 9 3
|
||||
par_shard 9 4
|
||||
par_shard 9 5
|
||||
par_shard 9 6
|
||||
par_shard 9 7
|
||||
par_shard 9 8
|
||||
par_shard 9 9
|
||||
par_shard *** broken
|
||||
par_shard parallel: Error: --tee requres --jobs to be higher. Try --jobs 0.
|
||||
par_shard parallel: Error: --tee requres --jobs to be higher. Try --jobs 0.
|
||||
|
|
Loading…
Reference in a new issue