parallel: Fixed bug #59843: --regexp --recstart '#' fails.

2024-11-21 21:47:54 +00:00 · 2021-01-08 15:36:05 +01:00 · 2021-01-08 15:36:05 +01:00 · f539554727
parent 2f28e78c0b
commit f539554727
11 changed files with 119 additions and 121 deletions
--- a/doc/haikus
+++ b/doc/haikus
@ -1,8 +1,11 @@
 Quote of the month:

+  It's really quite amazing how powerful and flexible it is
+    -- schwanengesang @tensegrist@twitter
+
  Every time I install @ubuntu, one of the first tools I install is
  @gnuparallel. I love it.
-    -- Necati Demir @ndemir
+    -- Necati Demir @ndemir@twitter

  Today I'm grateful for GNU parallel, especially with the --colsep and
  --jobs parameters #GiveThanks
--- a/doc/release_new_version
+++ b/doc/release_new_version
@ -192,7 +192,7 @@ from:tange@gnu.org
 to:parallel@gnu.org, bug-parallel@gnu.org
 stable-bcc: Jesse Alama <jessealama@fastmail.fm>

-Subject: GNU Parallel 20210122 ('') released <<[stable]>>
+Subject: GNU Parallel 20210122 ('Capitol Riots') released <<[stable]>>

 GNU Parallel 20210122 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/

@ -213,6 +213,8 @@ New in this release:

 News about GNU Parallel:

+https://www.codenong.com/25172209/
+
 <<>>

 Get the book: GNU Parallel 2018 http://www.lulu.com/shop/ole-tange/gnu-parallel-2018/paperback/product-23558902.html
--- a/src/parallel
+++ b/src/parallel
@ -322,6 +322,7 @@ sub parcat_script() {
 		    for $infh (@ready) {
 			# There is only one key, namely the output file descriptor
 			for my $outfd (keys %{$buffer{$infh}}) {
+			    # TODO test if 65536 (2^16) is optimal (2^17 is used elsewhere)
 			    $rv = sysread($infh, $buf, 65536);
 			    if (!$rv) {
 				if($! == EAGAIN) {
@ -624,6 +625,7 @@ sub find_split_positions($$$) {
    }
    # The optimal dd blocksize for mint, redhat, solaris, openbsd = 2^17..2^20
    # The optimal dd blocksize for freebsd = 2^15..2^17
+    # The optimal dd blocksize for ubuntu (AMD6376) = 2^16
    my $dd_block_size = 131072; # 2^17
    my @pos;
    my ($recstart,$recend) = recstartrecend();
@ -1232,6 +1234,11 @@ sub recstartrecend() {
 	# this should only apply to the regexp
 	$recstart = "(?:".$recstart.")";
 	$recend = "(?:".$recend.")";
+	# Backslash-escape '#' and space in both regexps
+	$recstart =~ s/#/\\#/g;
+	$recend =~ s/#/\\#/g;
+	$recstart =~ s/ /\\ /g;
+	$recend =~ s/ /\\ /g;
    } else {
 	# $recstart/$recend = printf strings (\n)
 	$recstart =~ s/\\([0rnt\'\"\\])/"qq|\\$1|"/gee;
@ -2017,8 +2024,8 @@ sub parse_options(@) {
    # the alternatives instead?
    # See a list in: 'man parallel_alternatives'
    #
-    # If you want GNU Parallel to be maintained in the future keep
-    # this line.
+    # If you want GNU Parallel to be maintained in the future you
+    # should keep this line.
    citation_notice();
    # Seriously: _YOU_ will be harming free software by removing the
    # notice.  _YOU_ make it harder to justify spending time developing
@ -11722,7 +11729,6 @@ sub max_length($) {
 	my $len_cache = $Global::cache_dir . "/tmp/sshlogin/" . ::hostname() .
 	    "/linelen";
 	my $cached_limit;
-
 	if(open(my $fh, "<", $len_cache)) {
 	    $cached_limit = <$fh>;
 	    $cached_limit || ::die_bug("Cannot read $len_cache");
--- a/src/parallel.pod
+++ b/src/parallel.pod
@ -338,6 +338,20 @@ sequence number of job

 the arguments

+=item Z<> B<yyyy_mm_dd_hh_mm_ss()>
+
+=item Z<> B<yyyy_mm_dd_hh_mm()>
+
+=item Z<> B<yyyy_mm_dd()>
+
+=item Z<> B<yyyymmddhhmmss()>
+
+=item Z<> B<yyyymmddhhmm()>
+
+=item Z<> B<yyyymmdd()>
+
+time functions
+
 =back

 Example:
@ -3845,7 +3859,7 @@ is much faster.

 If it still does not fit in memory you can do this:

-  parallel --pipepart -a regexps.txt --block 1M grep -Ff - -n bigfile | \
+  parallel --pipepart -a regexps.txt --block 1M grep -F -f - -n bigfile | \
    sort -un | perl -pe 's/^\d+://'

 The 1M should be your free memory divided by the number of CPU threads and
--- a/src/parallel_alternatives.pod
+++ b/src/parallel_alternatives.pod
@ -23,7 +23,7 @@ developers with irregular releases and only maintained for a few
 years.


-=head2 SUMMARY TABLE
+=head2 SUMMARY LEGEND

 The following features are in some of the comparable tools:

@ -52,7 +52,7 @@ B<Outputs>
 O5. Stdout only contains stdout (standard output) from the command
 O6. Stderr only contains stderr (standard error) from the command
 O7. Buffering on disk
- O8. Cleanup of file if killed
+ O8. Cleanup of temporary files if killed
 O9. Test if disk runs full during run
 O10. Output of a line bigger than 4 GB

@ -86,7 +86,7 @@ B<Legend>
 ID = yes

 As every new version of the programs are not tested the table may be
-outdated. Please file a bug-report if you find errors (See REPORTING
+outdated. Please file a bug report if you find errors (See REPORTING
 BUGS).

 parallel:
@ -97,26 +97,10 @@ E1 E2 E3 E4 E5 E6 E7
 R1 R2 R3 R4 R5 R6 R7 R8 R9
 S1 S2

-find -exec:
-  -  -  x  -  x  -
-  M2 M3 -  -  -  -
-  O2 O3 O4 O5 O6
-  -  -  -  -  -  -
-  -  -  -  -  -  -  -  -
-x  x
-
-make -j:
-  -  -  -  -  -  -
-  -  -  -  -  -
-O1 O2 O3 -  x  O6
-E1 -  -  -  E5 -
-  -  -  -  -  -  -  -  -
-  -
-

 =head2 DIFFERENCES BETWEEN xargs AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 I1 I2 - - - - -
 - M2 M3 - - -
 - O2 O3 - O5 O6
@ -212,6 +196,14 @@ https://www.gnu.org/software/findutils/

 =head2 DIFFERENCES BETWEEN find -exec AND GNU Parallel

+Summary (see legend above):
+-  -  -  x  -  x  -
+-  M2 M3 -  -  -  -
+-  O2 O3 O4 O5 O6
+-  -  -  -  -  -  -
+-  -  -  -  -  -  -  -  -
+x  x
+
 B<find -exec> offers some of the same possibilities as GNU B<parallel>.

 B<find -exec> only works on files. Processing other input (such as
@ -223,6 +215,14 @@ https://www.gnu.org/software/findutils/ (Last checked: 2019-01)

 =head2 DIFFERENCES BETWEEN make -j AND GNU Parallel

+Summary (see legend above):
+-  -  -  -  -  -  -
+-  -  -  -  -  -
+O1 O2 O3 -  x  O6
+E1 -  -  -  E5 -
+-  -  -  -  -  -  -  -  -
+-  -
+
 B<make -j> can run jobs in parallel, but requires a crafted Makefile
 to do this. That results in extra quoting to get filenames containing
 newlines to work correctly.
@ -238,7 +238,7 @@ https://www.gnu.org/software/make/ (Last checked: 2019-01)

 =head2 DIFFERENCES BETWEEN ppss AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 I1 I2 - - - - I7
 M1 - M3 - - M6
 O1 - - x - -
@ -323,7 +323,7 @@ https://github.com/louwrentius/PPSS

 =head2 DIFFERENCES BETWEEN pexec AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 I1 I2 - I4 I5 - -
 M1 - M3 - - M6
 O1 O2 O3 - O5 O6
@ -646,7 +646,7 @@ https://github.com/cheusov/paexec

 =head2 DIFFERENCES BETWEEN map(sitaramc) AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 I1 - - I4 - - (I7)
 M1 (M2) M3 (M4) M5 M6
 - O2 O3 - O5 - - N/A N/A O10
@ -1962,7 +1962,7 @@ https://github.com/fd0/machma (Last checked: 2019-06)

 =head2 DIFFERENCES BETWEEN interlace AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 - I2 I3 I4 - - -
 M1 - M3 - - M6
 - O2 O3 - - - - x x
@ -2337,7 +2337,7 @@ https://github.com/amritb/with-this.git (Last checked: 2019-03)

 =head2 DIFFERENCES BETWEEN Tollef's parallel (moreutils) AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 - - - I4 - - I7
 - - M3 - - M6
 - O2 O3 - O5 O6 - x x
@ -2363,7 +2363,7 @@ B<GNU> parallel -j 3 ::: ls df "echo hi"

 =head2 DIFFERENCES BETWEEN rargs AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 I1 - - - - - I7
 - - M3 M4 - -
 - O2 O3 - O5 O6 - O8 -
@ -2422,7 +2422,7 @@ https://github.com/lotabout/rargs (Last checked: 2020-01)

 =head2 DIFFERENCES BETWEEN threader AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 I1 - - - - - -
 M1 - M3 - - M6
 O1 - O3 - O5 - - N/A N/A
@ -2444,7 +2444,7 @@ https://github.com/voodooEntity/threader (Last checked: 2020-04)

 =head2 DIFFERENCES BETWEEN runp AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 I1 I2 - - - - -
 M1 - (M3) - - M6
 O1 O2 O3 - O5 O6 - N/A N/A -
@ -2570,7 +2570,7 @@ https://github.com/jreisinger/runp (Last checked: 2020-04)

 =head2 DIFFERENCES BETWEEN papply AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 - - - I4 - - -
 M1 - M3 - - M6
 - - O3 - O5 - - N/A N/A O10
@ -2616,7 +2616,7 @@ https://pypi.org/project/papply/ (Last checked: 2020-04)

 =head2 DIFFERENCES BETWEEN async AND GNU Parallel

-Summary table (see legend above):
+Summary (see legend above):
 - - - I4 - - I7
 - - - - - M6
 - O2 O3 - O5 O6 - N/A N/A O10
@ -2690,88 +2690,35 @@ composed commands.

 https://github.com/ctbur/async/ (Last checked: 2020-11)

+
+=head2 DIFFERENCES BETWEEN pardi AND GNU Parallel
+
+Summary (see legend above):
+I1 I2 - - - - I7
+M1 - - - - M6
+O1 O2 O3 O4 O5 - O7 - - O10
+E1 - - E4 - - -
+- - - - - - - - -
+- -
+
+B<pardi> is very similar to B<parallel --pipe --cat>: It reads blocks
+of data and not arguments. So it cannot insert an argument in the
+command line. It puts the block into a temporary file, and this file
+name (%IN) can be put in the command line. You can only use %IN once.
+
+It can also run full command lines in parallel (like: B<cat file |
+parallel>).
+
+https://github.com/UnixJunkie/pardi (Last checked: 2021-01)
+
+
 =head2 Todo

-test_many_var() {
-gen500k() { seq -f %f 1000000000000000 1000000000050000 | head -c 131000; }
-for a in `seq 11000`; do eval "export a$a=1" ; done
-gen500k | stdout parallel --timeout 5 -Xj1  'echo {} {} {} {} | wc' | perl -pe 's/\d{3,5} //g'
-}
-
-test_many_var_func() {
-gen500k() { seq -f %f 1000000000000000 1000000000050000 | head -c 131000; }
-for a in `seq 5100`; do eval "export a$a=1" ; done
-for a in `seq 5100`; do eval "a$a() { 1; }" ; done
-for a in `seq 5100`; do eval export -f a$a ; done
-gen500k | stdout parallel --timeout 21 -Xj1  'echo {} {} {} {} | wc' | perl -pe 's/\d{3,5} //g'
-}
-
-test_many_var_func() {
-gen500k() { seq -f %f 1000000000000000 1000000000050000 | head -c 131000; }
-for a in `seq 8000`; do eval "a$a() { 1; }" ; done
-for a in `seq 8000`; do eval export -f a$a ; done
-gen500k | stdout parallel --timeout 6 -Xj1  'echo {} {} {} {} | wc' | perl -pe 's/\d{3,5} //g'
-}
-
-test_big_func() {
-gen500k() { seq -f %f 1000000000000000 1000000000050000 | head -c 131000; }
-big=`seq 1000`
-for a in `seq 50`; do eval "a$a() { '$big'; }" ; done
-for a in `seq 50`; do eval export -f a$a ; done
-gen500k | stdout parallel --timeout 4 -Xj1  'echo {} {} {} {} | wc' | perl -pe 's/\d{3,5} //g'
-}
-
-test_many_var_big_func() {
-gen500k() { seq -f %f 1000000000000000 1000000000050000 | head -c 131000; }
-big=`seq 1000`
-for a in `seq 5100`; do eval "export a$a=1" ; done
-for a in `seq 20`; do eval "a$a() { '$big'; }" ; done
-for a in `seq 20`; do eval export -f a$a ; done
-gen500k | stdout parallel --timeout 6 -Xj1  'echo {} {} {} {} | wc' | perl -pe 's/\d{3,5} //g'
-}
-
-test_big_func_name() {
-gen500k() { seq -f %f 1000000000000000 1000000000050000 | head -c 131000; }
-big=`perl -e print\"x\"x10000`
-for a in `seq 20`; do eval "export a$big$a=1" ; done
-gen500k | stdout parallel --timeout 8 -Xj1  'echo {} {} {} {} | wc' | perl -pe 's/\d{3,5} //g'
-}
-
-test_big_var_func_name() {
-gen500k() { seq -f %f 1000000000000000 1000000000050000 | head -c 131000; }
-big=`perl -e print\"x\"x10000`
-for a in `seq 2`; do eval "export a$big$a=1" ; done
-for a in `seq 2`; do eval "a$big$a() { '$big'; }" ; done
-for a in `seq 2`; do eval export -f a$big$a ; done
-gen500k | stdout parallel --timeout 1000 -Xj1  'echo {} {} {} {} | wc' | perl -pe 's/\d{3,5} //g'
-}
-
-
-
-tange@macosx:~$ for a in `seq 100`; do eval export a$a=fffffffffffffffffffffffff ; donetange@macosx:~$ seq 50000 | stdout parallel -Xj1  'echo {} {} | wc' | perl -pe 's/\d{3,5} //g'
-tange@macosx:~$ for a in `seq 100`; do eval export a$a=fffffffffffffffffffffffff ; donetange@macosx:~$ seq 50000 | stdout parallel -Xj1  'echo {} {} | wc' | perl -pe 's/\d{3,5} //g'
-tange@macosx:~$ for a in `seq 100`; do eval export -f a$a ; done
-
-
-seq 100000 | stdout parallel -Xj1  'echo {} {} | wc'
-export a=`seq 10000`
-seq 100000 | stdout parallel -Xj1  'echo {} {} | wc'
-
-
-
-    my $already_spread;
-    my $env_size;
-
-        if($^O eq "darwin") {
-            $env_size ||= 500+length(join'',%ENV);
-            $max_len -= $env_size;
-        }
-
-
 PASH: Light-touch Data-Parallel Shell Processing
-https://arxiv.org/pdf/2007.09436.pdf

-https://github.com/UnixJunkie/pardi
+https://arxiv.org/pdf/2012.15443.pdf KumQuat
+
+https://arxiv.org/pdf/2007.09436.pdf

 https://github.com/UnixJunkie/PAR (Same as http://savannah.nongnu.org/projects/par above?)

--- a/src/parallel_tutorial.pod
+++ b/src/parallel_tutorial.pod
@ -2999,7 +2999,7 @@ When asking for help, always report the full output of this:

 Output:

-  GNU parallel 20200122
+  GNU parallel 20210122
  Copyright (C) 2007-2021 Ole Tange, http://ole.tange.dk and Free Software
  Foundation, Inc.
  License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
@ -3161,7 +3161,7 @@ https://my.fsf.org/donate/

 =back

-(C) 2013-2020 Ole Tange, FDLv1.3 (See fdl.txt)
+(C) 2013-2021 Ole Tange, FDLv1.3 (See fdl.txt)


 =cut
--- a/src/parsort
+++ b/src/parsort
@ -18,7 +18,7 @@ B<parsort> uses GNU B<sort> to sort in parallel. It works just like
 B<sort> but faster on inputs with more than 1 M lines, if you have a
 multicore machine.

-Hopefully these ideas will make it into GNU Sort in the future.
+Hopefully these ideas will make it into GNU B<sort> in the future.


 =head1 EXAMPLE
--- a/testsuite/REQUIREMENTS
+++ b/testsuite/REQUIREMENTS
@ -262,10 +262,14 @@ lsh_setup() {
    lsh -c aes256-ctr --sloppy-host-authentication \
 	--capture-to ~/.lsh/host-acls localhost echo Added host-auth
    lsh-keygen | lsh-writekey -c none
+    export_key_to_local_users() {
 	lsh-export-key --openssh < ~/.lsh/identity.pub |
-	lsh -c aes256-ctr lo 'cat >>.ssh/authorized_keys'
-    lsh-export-key --openssh < ~/.lsh/identity.pub |
-	ssh csh@lo 'cat >>.ssh/authorized_keys'
+	    ssh -l $1 lo 'cat >>.ssh/authorized_keys'
+    }
+    export -f export_key_to_local_users
+    shellsplus | parallel --bar --timeout 5 export_key_to_local_users
+    shellsplus | parallel --bar --timeout 5 'lsh -l {} lo true || export_key_to_local_users {}'
+    shellsplus | parallel --bar --timeout 5 'lsh -l {} lo true || echo Fail {}'
 }

 add_freebsd() {
--- a/testsuite/tests-to-run/parallel-local-1s.sh
+++ b/testsuite/tests-to-run/parallel-local-1s.sh
@ -4,6 +4,18 @@
 # Each should be taking 1-3s and be possible to run in parallel
 # I.e.: No race conditions, no logins

+par_recend_recstart_hash() {
+    echo "### bug #59843: --regexp --recstart '#' fails"
+    (echo '#rec1'; echo 'bar'; echo '#rec2') |
+	parallel -k --regexp --pipe -N1 --recstart '#' wc 
+    (echo ' rec1'; echo 'bar'; echo ' rec2') |
+	parallel -k --regexp --pipe -N1 --recstart ' ' wc 
+    (echo 'rec2';  echo 'bar#';echo 'rec2' ) |
+	parallel -k --regexp --pipe -N1 --recend '#' wc 
+    (echo 'rec2';  echo 'bar ';echo 'rec2' ) |
+	parallel -k --regexp --pipe -N1 --recend ' ' wc 
+}
+
 par_sqlandworker_uninstalled_dbd() {
    echo 'bug #56096: dbi-csv no such column'
    mkdir -p /tmp/parallel-bug-56096
--- a/testsuite/tests-to-run/parallel-local-ssh1.sh
+++ b/testsuite/tests-to-run/parallel-local-ssh1.sh
@ -277,6 +277,7 @@ E	agrp=c+b+csh@lo+lo+bash@lo
 E	agrp=c+b+lo+bash@lo+csh@lo
 E	agrp=c+b+lo+csh@lo+bash@lo
 E	agrp=c+bash@lo+b+csh@lo+lo
+E	agrp=c+bash@lo+csh@lo+b+lo
 E	agrp=c+bash@lo+b+lo+csh@lo
 E	agrp=c+bash@lo+csh@lo+lo+b
 E	agrp=c+bash@lo+lo+b+csh@lo
--- a/testsuite/wanted-results/parallel-local-1s
+++ b/testsuite/wanted-results/parallel-local-1s
@ -384,6 +384,15 @@ par_pxz_complains	bug #44250: pxz complains File format not recognized but decom
 par_pxz_complains	ls: cannot access '/OK-if-missing-file': No such file or directory
 par_pxz_complains	can not seek in input: Illegal seek
 par_pxz_complains	ls: cannot access '/OK-if-missing-file': No such file or directory
+par_recend_recstart_hash	### bug #59843: --regexp --recstart '#' fails
+par_recend_recstart_hash	      2       2      10
+par_recend_recstart_hash	      1       1       6
+par_recend_recstart_hash	      2       2      10
+par_recend_recstart_hash	      1       1       6
+par_recend_recstart_hash	      1       2       9
+par_recend_recstart_hash	      2       1       6
+par_recend_recstart_hash	      1       2       9
+par_recend_recstart_hash	      2       1       6
 par_replacement_rename	### Test --basenamereplace
 par_replacement_rename	b.c b.c b.c b b b
 par_replacement_rename	b.c