parallel: --skip-first-line for --pipe(part).

2024-12-22 12:47:54 +00:00 · 2021-10-09 22:29:36 +02:00 · 2021-10-09 22:29:36 +02:00 · ccc54495bd
parent 072897c567
commit ccc54495bd
5 changed files with 74 additions and 17 deletions
--- a/15
+++ b/15
@ -74,8 +74,9 @@ run() {
 		# OK
 		return 0
 	    else
-		keyservers="pgp.surf.nl
-   			    keyserver.bazon.ru
+		keyservers="keyserver.ubuntu.com
+			    pgp.surf.nl
+			    keyserver.bazon.ru
 			    agora.cenditel.gob.ve
 			    pgp.benny-baumann.de"
 		for keyserver in $keyservers ; do
@ -89,7 +90,7 @@ run() {
 		echo "Cannot fetch keyID 0x88888888, so the signature cannot be checked."
 		return 1
 	    fi
-  	else
+	else
 	    # GnuPG not installed
 	    echo
 	    echo "GnuPG (gpg) is not installed so the signature cannot be checked."
@ -103,7 +104,7 @@ run() {
 	       perl -e 'exit not grep /^Primary key fingerprint: BE9C B493 81DE 3166 A3BC..66C1 2C62 29E2 FFFF FFF1|^Primary key fingerprint: CDA0 1A42 08C4 F745 0610..7E7B D1AB 4516 8888 8888/, <>'; then
 	    # Source code signed by Ole Tange <ole@tange.dk>
 	    # KeyID FFFFFFF1/88888888
-     	    true
+	    true
 	else
 	    # GnuPG signature failed
 	    echo
@ -116,8 +117,8 @@ run() {
    else
 	# GnuPG not installed or public keys not downloaded
 	echo "This means that if the code has been changed by criminals, you will not discover that!"
-     	echo
-       	echo "Continue anyway? (y/n)"
+	echo
+	echo "Continue anyway? (y/n)"
 	read YN </dev/tty
 	if test "$YN" = "n"; then
 	    # Stop
@ -148,7 +149,7 @@ run() {
 	# Is $HOME/bin already in $PATH?
 	if echo "$PATH" | grep "$HOME"/bin >/dev/null; then
 	    # $HOME/bin is already in $PATH
-       	    true
+	    true
 	else
 	    # Add $HOME/bin to $PATH for both bash and csh
 	    echo 'PATH=$PATH:$HOME/bin' >> "$HOME"/.bashrc
--- a/doc/haikus
+++ b/doc/haikus
@ -4,6 +4,9 @@
 
 Quote of the month:

+  GNU Parallelめっちゃ便利で偉い
+    -- аiгbus @airbus_P
+
  I really liked GNU Parallel http://gnu.org/software/parallel/
  one of the best tool to execute parallel jobs in the shell
    -- Luca Molteni @volothamp@twitter
--- a/src/parallel
+++ b/src/parallel
@ -566,14 +566,26 @@ sub pipe_part_files(@) {
 		"$file is not a seekable file.");
 	::wait_and_exit(255);
    }
-    my $header = find_header(\$buf,open_or_exit($file));
+
+    my $fh = open_or_exit($file);
+    my $firstlinelen = 0;
+    if($opt::skip_first_line) {
+	# Read the first line one byte at a time, counting its length.
+	# Test sysread itself so EOF (0) or error (undef) ends the loop.
+	while(sysread($fh,my $byte,1,0)) {
+	    $firstlinelen++;
+	    $byte eq "\n" and last;
+	}
+    }
+    my $header = find_header(\$buf,$fh);
    # find positions
-    my @pos = find_split_positions($file,int($Global::blocksize),$header);
+    my @pos = find_split_positions($file,int($Global::blocksize),
+				   $header,$firstlinelen);
    # Make @cat_prepends
    my @cat_prepends = ();
    for(my $i=0; $i<$#pos; $i++) {
 	push(@cat_prepends,
-	     cat_partial($file, 0, length($header), $pos[$i], $pos[$i+1]));
+	     cat_partial($file, $firstlinelen, $firstlinelen+length($header),
+			 $pos[$i], $pos[$i+1]));
    }
    return @cat_prepends;
 }
@ -618,8 +630,8 @@ sub find_split_positions($$$) {
    #   $opt::recend
    # Returns:
    #   @positions of block start/end
-    my($file, $block, $header) = @_;
-    my $headerlen = length $header;
+    my($file, $block, $header, $firstlinelen) = @_;
+    my $skiplen = $firstlinelen + length $header;
    my $size = -s $file;
    if(-b $file) {
 	# $file is a blockdevice
@ -627,7 +639,8 @@ sub find_split_positions($$$) {
    }
    $block = int $block;
    if($opt::groupby) {
-	return split_positions_for_group_by($file,$size,$block,$header);
+	return split_positions_for_group_by($file,$size,$block,
+					    $header,$firstlinelen);
    }
    # The optimal dd blocksize for mint, redhat, solaris, openbsd = 2^17..2^20
    # The optimal dd blocksize for freebsd = 2^15..2^17
@ -637,8 +650,8 @@ sub find_split_positions($$$) {
    my ($recstart,$recend) = recstartrecend();
    my $recendrecstart = $recend.$recstart;
    my $fh = ::open_or_exit($file);
-    push(@pos,$headerlen);
-    for(my $pos = $block+$headerlen; $pos < $size; $pos += $block) {
+    push(@pos,$skiplen);
+    for(my $pos = $block+$skiplen; $pos < $size; $pos += $block) {
 	my $buf;
 	if($recendrecstart eq "") {
 	    # records ends anywhere
@ -745,14 +758,14 @@ sub split_positions_for_group_by($$$$) {
 	return($v,$vpos);
    }

-    my ($file,$size,$block,$header) = @_;
+    my ($file,$size,$block,$header,$firstlinelen) = @_;
    my ($a,$b,$c,$apos,$bpos,$cpos);
    my @pos;
    $fh = open_or_exit($file);
    # Set $Global::group_by_column $Global::group_by_perlexpr
    group_by_loop($fh,$opt::recsep);
    # $xpos = linestart, $x = value at $xpos, $apos < $bpos < $cpos
-    $apos = length $header;
+    $apos = $firstlinelen + length $header;
    for(($a,$apos) = value_at($apos); $apos < $size;) {
 	push @pos, $apos;
 	$bpos = $apos + $block;
@ -967,6 +980,13 @@ sub spreadstdin() {
    my $in = *STDIN;
    my $timeout = $Global::blocktimeout;

+    if($opt::skip_first_line) {
+	my $newline;
+	# Read a full line one byte at a time
+	while(sysread($in,$newline,1,0)) {
+	    $newline eq "\n" and last;
+	}
+    }
    my $header = find_header(\$buf,$in);
    my $anything_written;
    my $eof;
--- a/testsuite/tests-to-run/parallel-local-1s.sh
+++ b/testsuite/tests-to-run/parallel-local-1s.sh
@ -8,6 +8,14 @@
 # Each should be taking 1-3s and be possible to run in parallel
 # I.e.: No race conditions, no logins

+par_skip_first_line() {
+    tmpfile=$(mktemp)	# file given to --pipepart via -a; removed at the end
+    (echo `seq 10000`;echo MyHeader; seq 10) | parallel -k --skip-first-line --pipe --block 10 --header '1' cat
+    (echo `seq 10000`;echo MyHeader; seq 10) > "$tmpfile"
+    parallel -k --skip-first-line --pipepart -a "$tmpfile" --block 10 --header '1' cat
+    rm "$tmpfile"
+}
+
 par_long_input() {
    echo '### Long input lines should not fail if they are not used'
    longline_tsv() {
--- a/testsuite/wanted-results/parallel-local-1s
+++ b/testsuite/wanted-results/parallel-local-1s
@ -838,6 +838,31 @@ par_seqreplace_long_line	### Test --seqreplace and line too long
 par_seqreplace_long_line	      9       1       1     101
 par_seqreplace_long_line	     90       1       1     201
 par_seqreplace_long_line	      1 parallel: Error: Command line too long (309 >= 210) at input 0: 100
+par_skip_first_line	MyHeader
+par_skip_first_line	1
+par_skip_first_line	2
+par_skip_first_line	3
+par_skip_first_line	4
+par_skip_first_line	5
+par_skip_first_line	MyHeader
+par_skip_first_line	6
+par_skip_first_line	7
+par_skip_first_line	8
+par_skip_first_line	9
+par_skip_first_line	MyHeader
+par_skip_first_line	10
+par_skip_first_line	MyHeader
+par_skip_first_line	1
+par_skip_first_line	2
+par_skip_first_line	3
+par_skip_first_line	4
+par_skip_first_line	5
+par_skip_first_line	6
+par_skip_first_line	MyHeader
+par_skip_first_line	7
+par_skip_first_line	8
+par_skip_first_line	9
+par_skip_first_line	10
 par_sql_colsep	### SQL should add Vn columns for --colsep
 par_sql_colsep	/a/A/1/11/
 par_sql_colsep	/a/A/2/22/