transpose: Race condition bug + testing.

2018-03-27 18:50:53 +02:00 · 2018-03-27 18:50:53 +02:00 · c9488299dc
parent f6a34e1200
commit c9488299dc
6 changed files with 260 additions and 274 deletions
--- a/19
+++ b/19
@ -1,10 +1,19 @@
 CMD = blink bsearch duplicate-packets em encdir field forever G		\
-gitnext gitundo goodpasswd histogram mtrr mirrorpdf neno off pdfman	\
-puniq ramusage rand rclean rina rn rrm shython sound-reload stdout	\
-swapout T timestamp tracefile upsidedown w4it-for-port-open		\
-wifi-reload wssh ytv yyyymmdd
+	gitnext gitundo goodpasswd histogram mtrr mirrorpdf neno off	\
+	pdfman puniq ramusage rand rclean rina rn rrm shython		\
+	sound-reload stdout swapout T timestamp tracefile transpose	\
+	upsidedown w4it-for-port-open wifi-reload wssh ytv yyyymmdd

-all: blink/blink.1 bsearch/bsearch.1 encdir/encdir.1 G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 goodpasswd/goodpasswd.1 histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 shython/shython.1 sound-reload/sound-reload.1 stdout/stdout.1 timestamp/timestamp.1 tracefile/tracefile.1 T/T.1 upsidedown/upsidedown.1 wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 yyyymmdd/yyyymmdd.1
+all: blink/blink.1 bsearch/bsearch.1 encdir/encdir.1 G/G.1		\
+	gitnext/gitnext.1 gitundo/gitundo.1 goodpasswd/goodpasswd.1	\
+	histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1		\
+	off/off.1 pdfman/pdfman.1 puniq/puniq.1 rand/rand.1		\
+	rina/rina.1 rn/rn.1 rrm/rrm.1 shython/shython.1			\
+	sound-reload/sound-reload.1 stdout/stdout.1			\
+	timestamp/timestamp.1 tracefile/tracefile.1			\
+	transpose/transpose.1 T/T.1 upsidedown/upsidedown.1		\
+	wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1			\
+	yyyymmdd/yyyymmdd.1

 %.1: %
 	pod2man $< > $@
--- a/transpose/test.sh
+++ b/transpose/test.sh
@ -0,0 +1,57 @@
+#!/bin/bash
+
+make_csv() {
+    # Create XXXsepYYY.csv (XXX rows, YYY cols, sep as separator)
+    normal() {
+	perl -e '($m,$sep,$n) = $ARGV[0]=~/(\d+)(\D+)(\d+)/; $sep = eval "\"$sep\""; for $l (1..$m){ print join $sep, map {"$_-$l"} (1..$n); print "\n" }' $@ > $@
+    }
+    transposed() {
+	perl -e '($m,$sep,$n) = $ARGV[0]=~/(\d+)(\D+)(\d+)/; $sep = eval "\"$sep\""; for $l (1..$n){ print join $sep, map {"$l-$_"} (1..$m); print "\n" }' $@ > $@.t
+    }
+    export -f normal transposed
+    parallel -q {} "$@" ::: normal transposed
+}
+
+md5transpose() {
+    local file
+    file=$1
+    blk="$2 $3"
+    echo File $file
+    transpose $blk -d "$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')" $file | md5sum
+    cat $file |
+	transpose $blk -d "$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')" | md5sum
+    cat $file.t | md5sum
+
+    transpose $blk -d "$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')" $file.t | md5sum
+    cat $file.t |
+	transpose $blk -d "$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')" | md5sum
+    cat $file | md5sum
+}
+
+dotest() {
+    if [ ! -e "$1".t ] ; then
+	make_csv "$1"
+    fi
+    md5transpose "$@"
+    echo
+}
+
+. `which env_parallel.bash`
+env_parallel -r <<EOF
+# Test --block 1 (problem with GNU Parallel < 20180422)
+dotest /tmp/table-3,1000.csv -b 1
+dotest /tmp/table-3,1000.csv
+dotest /tmp/table-3,10000.csv
+dotest /tmp/table-3,100000.csv
+dotest /tmp/table-3,1000000.csv
+dotest /tmp/table-3,10000000.csv
+
+dotest '/tmp/table-10\\t20.csv'
+dotest /tmp/table-10';'20.csv
+dotest '/tmp/table-100\\t200.csv'
+dotest /tmp/table-1,100.csv
+dotest /tmp/table-10,1000.csv
+dotest /tmp/table-100,10000.csv
+dotest /tmp/table-1000,100000.csv
+EOF
+
--- a/transpose/transpose
+++ b/transpose/transpose
@ -1,5 +1,188 @@
 #!/bin/bash

+: <<=cut
+=pod
+
+=head1 NAME
+
+transpose - transpose CSV file
+
+=head1 SYNOPSIS
+
+B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input>]
+
+=head1 DESCRIPTION
+
+B<transpose> reads a CSV file and prints it with rows and columns swapped.
+
+=head1 OPTIONS
+
+=over 9
+
+=item I<input>
+
+Input CSV file. If none is given reads from STDIN (standard input).
+
+
+=item B<-d> I<delim>
+
+Use I<delim> as delimiter in input and output.
+
+
+=item B<-b> I<blocksize>
+
+Pass chunks of I<blocksize> bytes to the internal transposer. Memory
+usage will be 10 times I<blocksize> per CPU core. Default is 10M.
+
+
+=back
+
+
+=head1 EXAMPLES
+
+=head2 EXAMPLE: Transpose a medium sized TSV file
+
+    cat medium.tsv | transpose -d '\t' > muidem.tsv
+
+=head1 DESIGN
+
+B<transpose> is designed to deal efficiently with medium sized data
+(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It
+works by chopping the input into blocks (10 MB by default, see
+B<-b>). Each block is transposed in parallel and saved to disk. Then
+these files are pasted together and finally removed.
+
+=head1 REPORTING BUGS
+
+Report bugs to <tange@gnu.org>.
+
+
+=head1 AUTHOR
+
+Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free
+Software Foundation, Inc.
+
+
+=head1 LICENSE
+
+Copyright (C) 2013 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+=head2 Documentation license I
+
+Permission is granted to copy, distribute and/or modify this documentation
+under the terms of the GNU Free Documentation License, Version 1.3 or
+any later version published by the Free Software Foundation; with no
+Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
+Texts.  A copy of the license is included in the file fdl.txt.
+
+=head2 Documentation license II
+
+You are free:
+
+=over 9
+
+=item B<to Share>
+
+to copy, distribute and transmit the work
+
+=item B<to Remix>
+
+to adapt the work
+
+=back
+
+Under the following conditions:
+
+=over 9
+
+=item B<Attribution>
+
+You must attribute the work in the manner specified by the author or
+licensor (but not in any way that suggests that they endorse you or
+your use of the work).
+
+=item B<Share Alike>
+
+If you alter, transform, or build upon this work, you may distribute
+the resulting work only under the same, similar or a compatible
+license.
+
+=back
+
+With the understanding that:
+
+=over 9
+
+=item B<Waiver>
+
+Any of the above conditions can be waived if you get permission from
+the copyright holder.
+
+=item B<Public Domain>
+
+Where the work or any of its elements is in the public domain under
+applicable law, that status is in no way affected by the license.
+
+=item B<Other Rights>
+
+In no way are any of the following rights affected by the license:
+
+=over 2
+
+=item *
+
+Your fair dealing or fair use rights, or other applicable
+copyright exceptions and limitations;
+
+=item *
+
+The author's moral rights;
+
+=item *
+
+Rights other persons may have either in the work itself or in
+how the work is used, such as publicity or privacy rights.
+
+=back
+
+=back
+
+=over 9
+
+=item B<Notice>
+
+For any reuse or distribution, you must make clear to others the
+license terms of this work.
+
+=back
+
+A copy of the full license is included in the file cc-by-sa.txt.
+
+=head1 DEPENDENCIES
+
+B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
+
+
+=head1 SEE ALSO
+
+B<bash>(1), B<parallel>(1), B<paste>(1)
+
+=cut
+
+
 # transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv
 # cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv

@ -88,23 +271,24 @@ super_paste() {
    # basename
    fifo=`tempfile`
    rm $fifo
-    cat > $paste_files
+    # Group files from stdin in groups of 1000 files
+    parallel -k -n1000 echo > $paste_files

    # Define replacement string {0#} to 0-pad job number
-    PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
+    export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
                    $_=sprintf("%0${f}d",seq())'\'

    # Make fifos that can be read from
-    cat $paste_files | parallel -n1000 "rm -f $fifo{0#}; mkfifo $fifo{0#}"
+    cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}"

    # Start a paste process for every 1000 files
-    cat $paste_files | parallel -n1000 -j0 "paste -d '$sep' {} > $fifo{0#}" &
+    cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" &

    # Paste all the fifos
    eval paste -d "'$sep'" $fifo*

    # Cleanup
-    cat $paste_files | parallel -n1000 "rm -f {} $fifo{0#}"
+    cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
    rm $paste_files
 }

--- a/transpose/transpose-simple
+++ b/transpose/transpose-simple
@ -1,9 +0,0 @@
-Can it be done more simple?
-
-zcat D.gz | perl -ne 's/\s+/\n/g; open(OUT,">","out".(++$out)); print OUT' ; paste out* | pigz >Dt.gz
-
-Chop CSV into fields
-
-multi file paste
-
-paste out1 out2 | paste - out3
--- a/transpose/transpose.pod
+++ b/transpose/transpose.pod
@ -1,175 +0,0 @@
-#!/usr/bin/perl
-
-=head1 NAME
-
-transpose - transpose CSV file
-
-=head1 SYNOPSIS
-
-B<transpose> [-d I<delim>] [I<input>]
-
-=head1 DESCRIPTION
-
-B<transpose> will read a CSV fie
-
-=head1 OPTIONS
-
-=over 9
-
-=item I<input>
-
-Input CSV file. If none is given reads from STDIN (standard input).
-
-
-=item B<-d> I<delim> - not implemented
-
-Use I<delim> as delimiter in input and output.
-
-
-=back
-
-
-=head1 EXAMPLES
-
-=head2 EXAMPLE: Transpose a big CSV file
-
-    cat medium.csv | transpose > muidem.csv
-
-=head1 DESIGN
-
-B<transpose> is designed to deal efficiently with medium sized data
-(up to 30 TB per file) on systems with 250 MB RAM per CPU core. It
-works by chopping the input into 30 MB blocks. Each block is
-transposed in parallel and saved to disk. Then these files are pasted
-together and finally removed.
-
-=head1 REPORTING BUGS
-
-Report bugs to <tange@gnu.org>.
-
-
-=head1 AUTHOR
-
-Copyright (C) 2013 Ole Tange, http://ole.tange.dk and Free
-Software Foundation, Inc.
-
-
-=head1 LICENSE
-
-Copyright (C) 2013 Free Software Foundation, Inc.
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 3 of the License, or
-at your option any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-=head2 Documentation license I
-
-Permission is granted to copy, distribute and/or modify this documentation
-under the terms of the GNU Free Documentation License, Version 1.3 or
-any later version published by the Free Software Foundation; with no
-Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
-Texts.  A copy of the license is included in the file fdl.txt.
-
-=head2 Documentation license II
-
-You are free:
-
-=over 9
-
-=item B<to Share>
-
-to copy, distribute and transmit the work
-
-=item B<to Remix>
-
-to adapt the work
-
-=back
-
-Under the following conditions:
-
-=over 9
-
-=item B<Attribution>
-
-You must attribute the work in the manner specified by the author or
-licensor (but not in any way that suggests that they endorse you or
-your use of the work).
-
-=item B<Share Alike>
-
-If you alter, transform, or build upon this work, you may distribute
-the resulting work only under the same, similar or a compatible
-license.
-
-=back
-
-With the understanding that:
-
-=over 9
-
-=item B<Waiver>
-
-Any of the above conditions can be waived if you get permission from
-the copyright holder.
-
-=item B<Public Domain>
-
-Where the work or any of its elements is in the public domain under
-applicable law, that status is in no way affected by the license.
-
-=item B<Other Rights>
-
-In no way are any of the following rights affected by the license:
-
-=over 2
-
-=item *
-
-Your fair dealing or fair use rights, or other applicable
-copyright exceptions and limitations;
-
-=item *
-
-The author's moral rights;
-
-=item *
-
-Rights other persons may have either in the work itself or in
-how the work is used, such as publicity or privacy rights.
-
-=back
-
-=back
-
-=over 9
-
-=item B<Notice>
-
-For any reuse or distribution, you must make clear to others the
-license terms of this work.
-
-=back
-
-A copy of the full license is included in the file as cc-by-sa.txt.
-
-=head1 DEPENDENCIES
-
-B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
-
-
-=head1 SEE ALSO
-
-B<bash>(1), B<parallel>(1), B<paste>(1)
-
-=cut
-
--- a/transpose/transposewrap.pl
+++ b/transpose/transposewrap.pl
@ -1,80 +0,0 @@
-#!/usr/bin/perl
-
-use File::Temp qw(tempfile tempdir);
-
-#$Global::debug = 1;
-my $block = "30m";
-debug("parallel --pipe --block $block -k --files -j150% transpose-par.pl\n");
-my @files = `parallel --pipe --block $block -k --files -j150% transpose-par.pl`;
-chomp(@files);
-my $tmp = File::Temp::tempdir(CLEANUP => 1);
-my $fifo = "$tmp/0000000";
-my $cmd = "mkfifo $fifo; paste > $fifo ";
-my (@fifos, @args);
-my $args_len = 0;
-my $max_line_length_allowed = `parallel --max-line-length-allowed`;
-
-while(@files) {
-    push @args, shift @files;
-    $args_len += length $args[$#args] + 1;
-    if(length $cmd + $args_len > $max_line_length_allowed) {
-	unshift @files, pop @args;
-	push @fifos, $fifo;
-	if(fork()) {
-	} else {
-	    debug("($cmd @args &)\n");
-	    `($cmd @args &)`;
-	    exit($?);
-	}
-	$fifo++;
-	$cmd = "mkfifo $fifo; paste > $fifo ";
-	@args = ();
-	$args_len = 0;
-    }
-}
-
-if(@args) {
-    push @fifos, $fifo;
-    if(fork()) {
-    } else {
-	debug("($cmd @args &)\n");
-	`($cmd @args &)`;
-	exit($?);
-    }
-}
-
-# make sure all fifos are created by the spawned shells
-my @non_existing_fifos = @fifos;
-while(@non_existing_fifos) {
-    if(not -e $non_existing_fifos[0]) {
-	usleep(1);
-    } else {
-	shift @non_existing_fifos;
-    }
-}
-
-debug("paste @fifos\n");
-system("paste @fifos");
-
-unlink(@fifos);
-rmdir($tmp);
-
-sub usleep {
-    # Sleep this many milliseconds.
-    my $secs = shift;
-    ::debug(int($secs),"ms ");
-    select(undef, undef, undef, $secs/1000);
-}
-
-sub debug {
-    # Returns: N/A
-    $Global::debug or return;
-    @_ = grep { defined $_ ? $_ : "" } @_;
-    if($Global::fd{1}) {
-	# Original stdout was saved
-	my $stdout = $Global::fd{1};
-        print $stdout @_;
-    } else {
-        print @_;
-    }
-}