Mirror of https://git.savannah.gnu.org/git/parallel.git

parallel: Fixed bug #56275: --shard with named column and perl expr.

parent 17a5fd702d
commit 5d81c7c053
doc/haikus (12 lines changed)

@@ -1,5 +1,17 @@
 Quote of the month:
 
+I love so much @GnuParallel to parallelize my SQL requests on @PostgreSQL
+ -- @rmaziere_85 Romain
+
+I want to make a shout-out for @GnuParallel, it's a work of beauty and power
+ -- Cristian Consonni @CristianCantoro
+
+Parallel is the BEST command.
+ -- Nick @NickInfoSec
+
+It is SUPER easy to speed up jobs from the command line w/ GNU parallel.
+ -- B3n @B3njaminHimes@twitter
+
 GNU parallel really changed how I do a lot of data processing stuff
  -- Brendan Dolan-Gavitt @moyix@twitter
 
doc/promo (17 lines changed)

@@ -38,11 +38,20 @@ newest version.
 
 Before GNU Parallel was a GNU tool, it started as a wrapper around
 `make -j`. But GNU Parallel grew, and was no longer just a small
-hack. To make the code easier to maintain it was rewritten to object
-orientation.
-
-This would not have been possible if the test suite had not been so
-thorough: It made it much easier to see if
+hack.
+
+The design goals included not requiring a compiler, compatibility with
+old operating systems, and being a single-file program. This limited
+the choice of languages tremendously.
+
+Perl and Python were in practice the only possibilities. Python was at
+the time quite slow, resource hungry, and not as widely installed as
+Perl. So Perl was the choice.
+
+To make the code easier to maintain it was rewritten to object
+orientation. This would not have been possible if the test suite had
+not been so thorough: It made it much easier to see if a code change
+caused a change in behaviour.
 
 
 =head2 --tollef
@@ -206,9 +206,9 @@ from:tange@gnu.org
 to:parallel@gnu.org, bug-parallel@gnu.org
 stable-bcc: Jesse Alama <jessealama@fastmail.fm>
 
-Subject: GNU Parallel 20190522 ('Akihito') released <<[stable]>>
+Subject: GNU Parallel 20190622 ('Frederiksen/Saybie/HongKong') released <<[stable]>>
 
-GNU Parallel 20190522 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
+GNU Parallel 20190622 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
 
 <<No new functionality was introduced so this is a good candidate for a stable release.>>
@@ -218,19 +218,16 @@ See https://www.gnu.org/software/parallel/10-years-anniversary.html
 
 Quote of the month:
 
-Amazingly useful script!
- -- unxusr@reddit.com
+<<>>
 
 
 New in this release:
 
-* --group-by groups lines depending on value of a column. The value can be computed.
-
-* How to compress (bzip / gzip) a very large text quickly? https://medium.com/@gchandra/how-to-compress-bzip-gzip-a-very-large-text-quickly-27c11f4c6681
-
-* Simple tutorial to install & use GNU Parallel https://medium.com/@gchandra/simple-tutorial-to-install-use-gnu-parallel-79251120d618
-
-* Introducing Parallel into Shell https://petelawson.com/post/parallel-in-shell/
+https://livefreeordichotomize.com/2019/06/04/using_awk_and_r_to_parse_25tb/
+
+https://zh.wikipedia.org/wiki/GNU_parallel
+chck NEWS http://parallelandvisualtestingwithbehat.blogspot.com/p/blog-page.html
+
+* GNU Parallel Akihito released https://linuxreviews.org/GNU_Parallel_Akihito_released
 
 * Bug fixes and man page updates.
src/parallel (32 lines changed)

@@ -378,6 +378,8 @@ sub sharder_script() {
     my $col = shift;
     # Which columns to shard on (count from 0)
     my $col0 = $col - 1;
+    # Perl expression
+    my $perlexpr = shift;
     my $bins = @ARGV;
     # Open fifos for writing, fh{0..$bins}
     my $t = 0;
@@ -388,12 +390,26 @@ sub sharder_script() {
     # so unlink only happens when it is ready
     unlink $_;
     }
+    if($perlexpr) {
+        my $subref = eval("sub { no strict; no warnings; $perlexpr }");
+        while(<STDIN>) {
+            # Split into $col columns (no need to split into more)
+            @F = split $sep, $_, $col+1;
+            {
+                local $_ = $F[$col0];
+                &$subref();
+                $fh = $fh{ hex(B::hash($_))%$bins };
+            }
+            print $fh $_;
+        }
+    } else {
     while(<STDIN>) {
         # Split into $col columns (no need to split into more)
         @F = split $sep, $_, $col+1;
         $fh = $fh{ hex(B::hash($F[$col0]))%$bins };
         print $fh $_;
     }
+    }
     # Close all open fifos
     close values %fh;
 };
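For intuition about the branch added above: when a perl expression is given, the column value is placed in $_, the expression runs, and whatever $_ ends up holding is hashed into one of the fifo bins. The following stand-alone sketch is not part of the commit; the separator, column index, bin count and expression are invented for illustration:

  printf 'a\t17\nb\t42\nc\t17\n' |
    perl -MB -e '
      my ($sep, $col0, $bins) = ("\t", 1, 3);   # hypothetical settings
      while(<STDIN>) {
        chomp;
        my @F = split $sep, $_, $col0 + 2;
        my $key = $F[$col0];
        $key %= 100;                            # stand-in for the user perl expression
        printf "%s -> bin %d\n", $F[$col0], hex(B::hash($key)) % $bins;
      }'

Lines with equal keys (here the two 17s) always map to the same bin, which is what lets --shard keep equal shard keys on the same job slot.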
@@ -423,6 +439,20 @@ sub pipe_shard_setup() {
     my $script = sharder_script();
     # cat foo | sharder sep col fifo1 fifo2 fifo3 ... fifoN
 
+    if($opt::shard =~ /^[a-z_][a-z_0-9]*(\s|$)/i) {
+        # Group by column name
+        # (Yes, this will also wrongly match a perlexpr like: chop)
+        my($read,$char,@line);
+        # A full line, but nothing more (the rest must be read by the child)
+        # $Global::header used to prepend block to each job
+        do {
+            $read = sysread(STDIN,$char,1);
+            push @line, $char;
+        } while($read and $char ne "\n");
+        $Global::header = join "", @line;
+    }
+    my ($col, $perlexpr, $subref) =
+        column_perlexpr($opt::shard, $Global::header, $opt::colsep);
     if(not fork()) {
         # Let the sharder inherit our stdin
         # and redirect stdout to null
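The byte-at-a-time sysread loop above is deliberate: the parent may consume only the header line, because everything after it must still be on stdin for the forked sharder. A rough shell illustration of the same "read exactly one line, leave the rest" idea (the input below is invented):

  printf 'colA\tcolB\n1\t2\n3\t4\n' |
    { IFS= read -r header; echo "header: $header"; wc -l; }

read consumes just the first line, so wc -l still sees the remaining two lines.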
@@ -432,7 +462,7 @@ sub pipe_shard_setup() {
         $ENV{'PERL_HASH_SEED'} = $$;
         exec qw(parallel --block 100k -q --pipe -j), $njobs,
             qw(--roundrobin -u perl -e), $script, ($opt::colsep || ","),
-            $opt::shard, '{}', (map { (':::+', @{$_}) } @parcatfifos);
+            $col, $perlexpr, '{}', (map { (':::+', @{$_}) } @parcatfifos);
     }
     # For each fifo
     # (rm fifo1; grep 1) < fifo1
@@ -2155,13 +2155,17 @@ Only supported in B<Ash, Bash, Dash, Ksh, Sh, and Zsh>.
 See also B<--env>, B<--record-env>.
 
 
-=item B<--shard> I<shardcol> (beta testing)
+=item B<--shard> I<shardexpr> (alpha testing)
 
-Use column I<shardcol> as shard key and shard input to the jobs.
+Use I<shardexpr> as shard key and shard input to the jobs.
 
-Each input line is split using B<--colsep>. The value in the
-I<shardcol> column is hashed so that all lines of a given value is
-given to the same job slot.
+I<shardexpr> is [column number|column name] [perlexpression] e.g. 3,
+Address, 3 $_%=100, Address s/\d//g.
+
+Each input line is split using B<--colsep>. The value of the column is
+put into $_, the perl expression is executed, and the resulting value
+is hashed so that all lines with a given value are given to the same
+job slot.
 
 This is similar to sharding in databases.
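A rough usage sketch of the new I<shardexpr> form (assumes a GNU Parallel that includes this change; the file addresses.tsv and its column layout are made up for illustration):

  # Shard a tab-separated file on the named column Address, after stripping
  # digits with a perl expression; equal keys always go to the same job slot.
  cat addresses.tsv |
    parallel --header : --colsep '\t' --pipe --shard 'Address s/\d//g' -j4 'wc -l'

--header : is what makes the column name available as a shard key, mirroring the testsuite's shard_on_col_name checks further down.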
@@ -68,8 +68,12 @@ The commands to run can be an array:
   echo "${data[1]}"
   echo "${data[2]}"
 
-B<parset> can not be part of a pipe. In particular this means it
-cannot read anything from standard input (stdin) or write to a pipe:
+B<parset> can read from stdin (standard input) if it is a file:
+
+  parset res echo < parallel_input_file
+
+but B<parset> can not be part of a pipe. In particular this means it
+cannot read from a pipe or write to a pipe:
 
   seq 10 | parset res echo Does not work
 
@@ -124,8 +128,12 @@ Put output into vars B<$seq, $pwd, $ls>:
   parset "seq pwd ls" ::: "seq 10" pwd ls
   echo "$ls"
 
-B<parset> can not be part of a pipe. In particular this means it
-cannot read anything from standard input (stdin) or write to a pipe:
+B<parset> can read from stdin (standard input) if it is a file:
+
+  parset res1,res2,res3 echo < parallel_input_file
+
+but B<parset> can not be part of a pipe. In particular this means it
+cannot read from a pipe or write to a pipe:
 
   seq 3 | parset res1,res2,res3 echo Does not work
 
@@ -4,6 +4,18 @@
 # Each should be taking 10-30s and be possible to run in parallel
 # I.e.: No race conditions, no logins
 
+par_kill_hup() {
+    echo '### Are children killed if GNU Parallel receives HUP? There should be no sleep at the end'
+
+    parallel -j 2 -q bash -c 'sleep {} & pid=$!; wait $pid' ::: 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 &
+    T=$!
+    sleep 9.9
+    pstree $$
+    kill -HUP $T
+    sleep 2
+    pstree $$
+}
+
 par_parset() {
     echo '### test parset'
     . `which env_parallel.bash`
@@ -61,6 +61,37 @@ par_shard() {
     }
     shard_on_col 1
     shard_on_col 2
+
+    shard_on_col_name() {
+        colname=$1
+        col=$2
+        (echo AB; seq 10 99 | shuf) | perl -pe 's/(.)/$1\t/g' |
+            parallel --header : --pipe --shard $colname -j2 --colsep "\t" sort -k$col |
+            field $col | uniq -c | sort
+    }
+    shard_on_col_name A 1
+    shard_on_col_name B 2
+
+    shard_on_col_expr() {
+        colexpr="$1"
+        col=$2
+        (seq 10 99 | shuf) | perl -pe 's/(.)/$1\t/g' |
+            parallel --pipe --shard "$colexpr" -j2 --colsep "\t" "sort -k$col; echo c1 c2" |
+            field $col | uniq -c | sort
+    }
+    shard_on_col_expr '1 $_%=3' 1
+    shard_on_col_expr '2 $_%=3' 2
+
+    shard_on_col_name_expr() {
+        colexpr="$1"
+        col=$2
+        (echo AB; seq 10 99 | shuf) | perl -pe 's/(.)/$1\t/g' |
+            parallel --header : --pipe --shard "$colexpr" -j2 --colsep "\t" "sort -k$col; echo c1 c2" |
+            field $col | uniq -c | sort
+    }
+    shard_on_col_name_expr 'A $_%=3' 1
+    shard_on_col_name_expr 'B $_%=3' 2
+
     echo '*** broken'
     # Shorthand for --pipe -j+0
     seq 100000 | parallel --shard 1 wc
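A quick manual check of the property these new tests exercise (sketch only; assumes a GNU Parallel build with this change): every value produced by the expression should be handled by exactly one of the two job slots.

  # Shard on column 1 modulo 3 across two jobs; each slot reports its line count.
  seq 10 99 | perl -pe 's/(.)/$1\t/g' |
    parallel --pipe --colsep '\t' --shard '1 $_%=3' -j2 'wc -l'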
@@ -148,18 +179,6 @@ par_kill_term() {
     pstree $$
 }
 
-par_kill_hup() {
-    echo '### Are children killed if GNU Parallel receives HUP? There should be no sleep at the end'
-
-    parallel -j 2 -q bash -c 'sleep {} & pid=$!; wait $pid' ::: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 &
-    T=$!
-    sleep 2.9
-    pstree $$
-    kill -HUP $T
-    sleep 2
-    pstree $$
-}
-
 par_kill_int_twice() {
     echo '### Are children killed if GNU Parallel receives INT twice? There should be no sleep at the end'
 
@@ -258,6 +258,15 @@ par_kill_children_timeout parallel: Warning: This job was killed because it timed out
 par_kill_children_timeout parallel: Warning: doit 1000000001
 par_kill_children_timeout 2
 par_kill_children_timeout 0 0 0
+par_kill_hup ### Are children killed if GNU Parallel receives HUP? There should be no sleep at the end
+par_kill_hup bash-+-perl
+par_kill_hup `-pstree
+par_kill_hup parallel: SIGHUP received. No new jobs will be started.
+par_kill_hup parallel: Waiting for these 2 jobs to finish. Send SIGTERM to stop now.
+par_kill_hup parallel: bash -c 'sleep 3 & pid=$!; wait $pid'
+par_kill_hup parallel: bash -c 'sleep 3 & pid=$!; wait $pid'
+par_kill_hup bash-+-perl---2*[bash---sleep]
+par_kill_hup `-pstree
 par_line_buffer ### --line-buffer
 par_line_buffer 55 55 120
 par_line_buffer These must diff: 1
@@ -1122,14 +1122,6 @@ par_jobslot_repl 2
 par_jobslot_repl 1
 par_jobslot_repl 2
 par_jobslot_repl 1
-par_kill_hup ### Are children killed if GNU Parallel receives HUP? There should be no sleep at the end
-par_kill_hup bash-+-perl---2*[bash---sleep]
-par_kill_hup `-pstree
-par_kill_hup parallel: SIGHUP received. No new jobs will be started.
-par_kill_hup parallel: Waiting for these 2 jobs to finish. Send SIGTERM to stop now.
-par_kill_hup parallel: bash -c 'sleep 1 & pid=$!; wait $pid'
-par_kill_hup parallel: bash -c 'sleep 1 & pid=$!; wait $pid'
-par_kill_hup bash---pstree
 par_kill_int_twice ### Are children killed if GNU Parallel receives INT twice? There should be no sleep at the end
 par_kill_int_twice bash-+-perl---bash---sleep
 par_kill_int_twice `-pstree
@@ -1288,6 +1280,71 @@ par_shard 9 6
 par_shard 9 7
 par_shard 9 8
 par_shard 9 9
+par_shard 10 1
+par_shard 10 2
+par_shard 10 3
+par_shard 10 4
+par_shard 10 5
+par_shard 10 6
+par_shard 10 7
+par_shard 10 8
+par_shard 10 9
+par_shard 9 0
+par_shard 9 1
+par_shard 9 2
+par_shard 9 3
+par_shard 9 4
+par_shard 9 5
+par_shard 9 6
+par_shard 9 7
+par_shard 9 8
+par_shard 9 9
+par_shard 10 1
+par_shard 10 2
+par_shard 10 3
+par_shard 10 4
+par_shard 10 5
+par_shard 10 6
+par_shard 10 7
+par_shard 10 8
+par_shard 10 9
+par_shard 1 c1
+par_shard 1 c1
+par_shard 1 c2
+par_shard 1 c2
+par_shard 9 0
+par_shard 9 1
+par_shard 9 2
+par_shard 9 3
+par_shard 9 4
+par_shard 9 5
+par_shard 9 6
+par_shard 9 7
+par_shard 9 8
+par_shard 9 9
+par_shard 10 1
+par_shard 10 2
+par_shard 10 3
+par_shard 10 4
+par_shard 10 5
+par_shard 10 6
+par_shard 10 7
+par_shard 10 8
+par_shard 10 9
+par_shard 1 c1
+par_shard 1 c1
+par_shard 1 c2
+par_shard 1 c2
+par_shard 9 0
+par_shard 9 1
+par_shard 9 2
+par_shard 9 3
+par_shard 9 4
+par_shard 9 5
+par_shard 9 6
+par_shard 9 7
+par_shard 9 8
+par_shard 9 9
 par_shard *** broken
 par_shard parallel: Error: --tee requres --jobs to be higher. Try --jobs 0.
 par_shard parallel: Error: --tee requres --jobs to be higher. Try --jobs 0.