From c9488299dc56d3305ef47eb9bcb8d653b0b676b1 Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Tue, 27 Mar 2018 18:50:53 +0200 Subject: [PATCH] transpose: Race condition bug + testing. --- Makefile | 19 +++- transpose/test.sh | 57 +++++++++++ transpose/transpose | 194 ++++++++++++++++++++++++++++++++++++- transpose/transpose-simple | 9 -- transpose/transpose.pod | 175 --------------------------------- transpose/transposewrap.pl | 80 --------------- 6 files changed, 260 insertions(+), 274 deletions(-) create mode 100644 transpose/test.sh delete mode 100644 transpose/transpose-simple delete mode 100644 transpose/transpose.pod delete mode 100755 transpose/transposewrap.pl diff --git a/Makefile b/Makefile index 3ce2970..4e03d61 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,19 @@ CMD = blink bsearch duplicate-packets em encdir field forever G \ -gitnext gitundo goodpasswd histogram mtrr mirrorpdf neno off pdfman \ -puniq ramusage rand rclean rina rn rrm shython sound-reload stdout \ -swapout T timestamp tracefile upsidedown w4it-for-port-open \ -wifi-reload wssh ytv yyyymmdd + gitnext gitundo goodpasswd histogram mtrr mirrorpdf neno off \ + pdfman puniq ramusage rand rclean rina rn rrm shython \ + sound-reload stdout swapout T timestamp tracefile transpose \ + upsidedown w4it-for-port-open wifi-reload wssh ytv yyyymmdd -all: blink/blink.1 bsearch/bsearch.1 encdir/encdir.1 G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 goodpasswd/goodpasswd.1 histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 shython/shython.1 sound-reload/sound-reload.1 stdout/stdout.1 timestamp/timestamp.1 tracefile/tracefile.1 T/T.1 upsidedown/upsidedown.1 wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 yyyymmdd/yyyymmdd.1 +all: blink/blink.1 bsearch/bsearch.1 encdir/encdir.1 G/G.1 \ + gitnext/gitnext.1 gitundo/gitundo.1 goodpasswd/goodpasswd.1 \ + histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \ + off/off.1 pdfman/pdfman.1 puniq/puniq.1 rand/rand.1 \ + rina/rina.1 rn/rn.1 rrm/rrm.1 shython/shython.1 \ + sound-reload/sound-reload.1 stdout/stdout.1 \ + timestamp/timestamp.1 tracefile/tracefile.1 \ + transpose/transpose.1 T/T.1 upsidedown/upsidedown.1 \ + wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 \ + yyyymmdd/yyyymmdd.1 %.1: % pod2man $< > $@ diff --git a/transpose/test.sh b/transpose/test.sh new file mode 100644 index 0000000..4cf6aa2 --- /dev/null +++ b/transpose/test.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +make_csv() { + # Create XXXsepYYY.csv (XXX rows, YYY cols, sep as separator) + normal() { + perl -e '($m,$sep,$n) = $ARGV[0]=~/(\d+)(\D+)(\d+)/; $sep = eval "\"$sep\""; for $l (1..$m){ print join $sep, map {"$_-$l"} (1..$n); print "\n" }' $@ > $@ + } + transposed() { + perl -e '($m,$sep,$n) = $ARGV[0]=~/(\d+)(\D+)(\d+)/; $sep = eval "\"$sep\""; for $l (1..$n){ print join $sep, map {"$l-$_"} (1..$m); print "\n" }' $@ > $@.t + } + export -f normal transposed + parallel -q {} "$@" ::: normal transposed +} + +md5transpose() { + local file + file=$1 + blk="$2 $3" + echo File $file + transpose $blk -d "$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')" $file | md5sum + cat $file | + transpose $blk -d "$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')" | md5sum + cat $file.t | md5sum + + transpose $blk -d "$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')" $file.t | md5sum + cat $file.t | + transpose $blk -d "$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')" | md5sum + cat $file | md5sum +} + +dotest() { + if [ ! -e "$1".t ] ; then + make_csv "$1" + fi + md5transpose "$@" + echo +} + +. `which env_parallel.bash` +env_parallel -r < [-d I] [-b I] [I] + +=head1 DESCRIPTION + +B will read a CSV fie + +=head1 OPTIONS + +=over 9 + +=item I + +Input CSV file. If none is given reads from STDIN (standard input). + + +=item B<-d> I + +Use I as delimiter in input and output. + + +=item B<-b> I + +Pass chunks of I bytes to the internal transposer. Memory +usage will be 10 times I per CPU core. Default is 10M. + + +=back + + +=head1 EXAMPLES + +=head2 EXAMPLE: Transpose a medium sized TSV file + + cat medium.tsv | transpose -d '\t' > muidem.tsv + +=head1 DESIGN + +B is designed to deal efficiently with medium sized data +(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It +works by chopping the input into 10 MB blocks. Each block is +transposed in parallel and saved to disk. Then these files are pasted +together and finally removed. + +=head1 REPORTING BUGS + +Report bugs to . + + +=head1 AUTHOR + +Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free +Software Foundation, Inc. + + +=head1 LICENSE + +Copyright (C) 2013 Free Software Foundation, Inc. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +at your option any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +=head2 Documentation license I + +Permission is granted to copy, distribute and/or modify this documentation +under the terms of the GNU Free Documentation License, Version 1.3 or +any later version published by the Free Software Foundation; with no +Invariant Sections, with no Front-Cover Texts, and with no Back-Cover +Texts. A copy of the license is included in the file fdl.txt. + +=head2 Documentation license II + +You are free: + +=over 9 + +=item B + +to copy, distribute and transmit the work + +=item B + +to adapt the work + +=back + +Under the following conditions: + +=over 9 + +=item B + +You must attribute the work in the manner specified by the author or +licensor (but not in any way that suggests that they endorse you or +your use of the work). + +=item B + +If you alter, transform, or build upon this work, you may distribute +the resulting work only under the same, similar or a compatible +license. + +=back + +With the understanding that: + +=over 9 + +=item B + +Any of the above conditions can be waived if you get permission from +the copyright holder. + +=item B + +Where the work or any of its elements is in the public domain under +applicable law, that status is in no way affected by the license. + +=item B + +In no way are any of the following rights affected by the license: + +=over 2 + +=item * + +Your fair dealing or fair use rights, or other applicable +copyright exceptions and limitations; + +=item * + +The author's moral rights; + +=item * + +Rights other persons may have either in the work itself or in +how the work is used, such as publicity or privacy rights. + +=back + +=back + +=over 9 + +=item B + +For any reuse or distribution, you must make clear to others the +license terms of this work. + +=back + +A copy of the full license is included in the file as cc-by-sa.txt. + +=head1 DEPENDENCIES + +B uses Perl, B, B and B. + + +=head1 SEE ALSO + +B(1), B(1), B(1) + +=cut + + # transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv # cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv @@ -88,23 +271,24 @@ super_paste() { # basename fifo=`tempfile` rm $fifo - cat > $paste_files + # Group files from stdin in groups of 1000 files + parallel -k -n1000 echo > $paste_files # Define replacement string {0#} to 0-pad job number - PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10))); + export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10))); $_=sprintf("%0${f}d",seq())'\' # Make fifos that can be read from - cat $paste_files | parallel -n1000 "rm -f $fifo{0#}; mkfifo $fifo{0#}" + cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}" # Start a paste process for every 1000 files - cat $paste_files | parallel -n1000 -j0 "paste -d '$sep' {} > $fifo{0#}" & + cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" & # Paste all the fifos eval paste -d "'$sep'" $fifo* # Cleanup - cat $paste_files | parallel -n1000 "rm -f {} $fifo{0#}" + cat $paste_files | parallel "eval rm -f {} $fifo{0#}" rm $paste_files } diff --git a/transpose/transpose-simple b/transpose/transpose-simple deleted file mode 100644 index baafa2b..0000000 --- a/transpose/transpose-simple +++ /dev/null @@ -1,9 +0,0 @@ -Can it be done more simple? - -zcat D.gz | perl -ne 's/\s+/\n/g; open(OUT,">","out".(++$out)); print OUT' ; paste out* | pigz >Dt.gz - -Chop CSV into fields - -multi file paste - -paste out1 out2 | paste - out3 diff --git a/transpose/transpose.pod b/transpose/transpose.pod deleted file mode 100644 index 100c46c..0000000 --- a/transpose/transpose.pod +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/perl - -=head1 NAME - -transpose - transpose CSV file - -=head1 SYNOPSIS - -B [-d I] [I] - -=head1 DESCRIPTION - -B will read a CSV fie - -=head1 OPTIONS - -=over 9 - -=item I - -Input CSV file. If none is given reads from STDIN (standard input). - - -=item B<-d> I - not implemented - -Use I as delimiter in input and output. - - -=back - - -=head1 EXAMPLES - -=head2 EXAMPLE: Transpose a big CSV file - - cat medium.csv | transpose > muidem.csv - -=head1 DESIGN - -B is designed to deal efficiently with medium sized data -(up to 30 TB per file) on systems with 250 MB RAM per CPU core. It -works by chopping the input into 30 MB blocks. Each block is -transposed in parallel and saved to disk. Then these files are pasted -together and finally removed. - -=head1 REPORTING BUGS - -Report bugs to . - - -=head1 AUTHOR - -Copyright (C) 2013 Ole Tange, http://ole.tange.dk and Free -Software Foundation, Inc. - - -=head1 LICENSE - -Copyright (C) 2013 Free Software Foundation, Inc. - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3 of the License, or -at your option any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . - -=head2 Documentation license I - -Permission is granted to copy, distribute and/or modify this documentation -under the terms of the GNU Free Documentation License, Version 1.3 or -any later version published by the Free Software Foundation; with no -Invariant Sections, with no Front-Cover Texts, and with no Back-Cover -Texts. A copy of the license is included in the file fdl.txt. - -=head2 Documentation license II - -You are free: - -=over 9 - -=item B - -to copy, distribute and transmit the work - -=item B - -to adapt the work - -=back - -Under the following conditions: - -=over 9 - -=item B - -You must attribute the work in the manner specified by the author or -licensor (but not in any way that suggests that they endorse you or -your use of the work). - -=item B - -If you alter, transform, or build upon this work, you may distribute -the resulting work only under the same, similar or a compatible -license. - -=back - -With the understanding that: - -=over 9 - -=item B - -Any of the above conditions can be waived if you get permission from -the copyright holder. - -=item B - -Where the work or any of its elements is in the public domain under -applicable law, that status is in no way affected by the license. - -=item B - -In no way are any of the following rights affected by the license: - -=over 2 - -=item * - -Your fair dealing or fair use rights, or other applicable -copyright exceptions and limitations; - -=item * - -The author's moral rights; - -=item * - -Rights other persons may have either in the work itself or in -how the work is used, such as publicity or privacy rights. - -=back - -=back - -=over 9 - -=item B - -For any reuse or distribution, you must make clear to others the -license terms of this work. - -=back - -A copy of the full license is included in the file as cc-by-sa.txt. - -=head1 DEPENDENCIES - -B uses Perl, B, B and B. - - -=head1 SEE ALSO - -B(1), B(1), B(1) - -=cut - diff --git a/transpose/transposewrap.pl b/transpose/transposewrap.pl deleted file mode 100755 index 34320a7..0000000 --- a/transpose/transposewrap.pl +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/perl - -use File::Temp qw(tempfile tempdir); - -#$Global::debug = 1; -my $block = "30m"; -debug("parallel --pipe --block $block -k --files -j150% transpose-par.pl\n"); -my @files = `parallel --pipe --block $block -k --files -j150% transpose-par.pl`; -chomp(@files); -my $tmp = File::Temp::tempdir(CLEANUP => 1); -my $fifo = "$tmp/0000000"; -my $cmd = "mkfifo $fifo; paste > $fifo "; -my (@fifos, @args); -my $args_len = 0; -my $max_line_length_allowed = `parallel --max-line-length-allowed`; - -while(@files) { - push @args, shift @files; - $args_len += length $args[$#args] + 1; - if(length $cmd + $args_len > $max_line_length_allowed) { - unshift @files, pop @args; - push @fifos, $fifo; - if(fork()) { - } else { - debug("($cmd @args &)\n"); - `($cmd @args &)`; - exit($?); - } - $fifo++; - $cmd = "mkfifo $fifo; paste > $fifo "; - @args = (); - $args_len = 0; - } -} - -if(@args) { - push @fifos, $fifo; - if(fork()) { - } else { - debug("($cmd @args &)\n"); - `($cmd @args &)`; - exit($?); - } -} - -# make sure all fifos are created by the spawned shells -my @non_existing_fifos = @fifos; -while(@non_existing_fifos) { - if(not -e $non_existing_fifos[0]) { - usleep(1); - } else { - shift @non_existing_fifos; - } -} - -debug("paste @fifos\n"); -system("paste @fifos"); - -unlink(@fifos); -rmdir($tmp); - -sub usleep { - # Sleep this many milliseconds. - my $secs = shift; - ::debug(int($secs),"ms "); - select(undef, undef, undef, $secs/1000); -} - -sub debug { - # Returns: N/A - $Global::debug or return; - @_ = grep { defined $_ ? $_ : "" } @_; - if($Global::fd{1}) { - # Original stdout was saved - my $stdout = $Global::fd{1}; - print $stdout @_; - } else { - print @_; - } -}