transpose: Race condition bug + testing.

This commit is contained in:
Ole Tange 2018-03-27 18:50:53 +02:00
parent f6a34e1200
commit c9488299dc
6 changed files with 260 additions and 274 deletions

View file

@ -1,10 +1,19 @@
CMD = blink bsearch duplicate-packets em encdir field forever G \
gitnext gitundo goodpasswd histogram mtrr mirrorpdf neno off pdfman \
puniq ramusage rand rclean rina rn rrm shython sound-reload stdout \
swapout T timestamp tracefile upsidedown w4it-for-port-open \
wifi-reload wssh ytv yyyymmdd
gitnext gitundo goodpasswd histogram mtrr mirrorpdf neno off \
pdfman puniq ramusage rand rclean rina rn rrm shython \
sound-reload stdout swapout T timestamp tracefile transpose \
upsidedown w4it-for-port-open wifi-reload wssh ytv yyyymmdd
all: blink/blink.1 bsearch/bsearch.1 encdir/encdir.1 G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 goodpasswd/goodpasswd.1 histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 shython/shython.1 sound-reload/sound-reload.1 stdout/stdout.1 timestamp/timestamp.1 tracefile/tracefile.1 T/T.1 upsidedown/upsidedown.1 wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 yyyymmdd/yyyymmdd.1
all: blink/blink.1 bsearch/bsearch.1 encdir/encdir.1 G/G.1 \
gitnext/gitnext.1 gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
off/off.1 pdfman/pdfman.1 puniq/puniq.1 rand/rand.1 \
rina/rina.1 rn/rn.1 rrm/rrm.1 shython/shython.1 \
sound-reload/sound-reload.1 stdout/stdout.1 \
timestamp/timestamp.1 tracefile/tracefile.1 \
transpose/transpose.1 T/T.1 upsidedown/upsidedown.1 \
wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 \
yyyymmdd/yyyymmdd.1
# Pattern rule: build the man page NAME.1 by running pod2man on the file NAME
# ($< is the prerequisite, $@ is the target being written).
%.1: %
pod2man $< > $@

57
transpose/test.sh Normal file
View file

@ -0,0 +1,57 @@
#!/bin/bash
make_csv() {
    # Create XXXsepYYY.csv (XXX rows, YYY cols, sep as separator)
    # and XXXsepYYY.csv.t, which holds the same table transposed.
    #
    # $1 = file name to create. The first "<digits><non-digits><digits>"
    #      run in the name gives rows, separator and columns. The separator
    #      is evaluated as a Perl double-quoted string (so \t becomes a tab).
    #
    # Every cell is "<column>-<row>" of the untransposed table, so the two
    # files are exact transposes of each other.
    normal() {
        # The file name is quoted both as argument and as redirect target:
        # unquoted, a name containing whitespace gives "ambiguous redirect".
        perl -e '($m,$sep,$n) = $ARGV[0]=~/(\d+)(\D+)(\d+)/; $sep = eval "\"$sep\""; for $l (1..$m){ print join $sep, map {"$_-$l"} (1..$n); print "\n" }' "$1" > "$1"
    }
    transposed() {
        perl -e '($m,$sep,$n) = $ARGV[0]=~/(\d+)(\D+)(\d+)/; $sep = eval "\"$sep\""; for $l (1..$n){ print join $sep, map {"$l-$_"} (1..$m); print "\n" }' "$1" > "$1.t"
    }
    # The helpers must be exported for the shells started by parallel.
    export -f normal transposed
    # Run "normal FILE" and "transposed FILE" side by side.
    parallel -q {} "$@" ::: normal transposed
}
md5transpose() {
    # Print MD5 sums that show whether transpose works on a fixture pair.
    #
    # $1      = CSV file; "$1.t" must hold the expected transposed table
    # $2, $3  = optional extra options for transpose (e.g. -b 1)
    #
    # Output: a "File ..." header followed by six md5sum lines in two groups
    # of three. Within each group all three sums are equal when transpose is
    # correct: transpose of $1 (file argument), transpose of $1 (stdin), and
    # the plain $1.t; then the same for $1.t against the plain $1.
    local file delim
    local -a blk
    file=$1
    # Keep the extra options as separate words without relying on
    # unquoted word splitting.
    blk=("${@:2:2}")
    # The delimiter is the non-digit run between the last two numbers in
    # the file name; compute it once instead of once per invocation.
    delim=$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')
    echo File "$file"
    transpose "${blk[@]}" -d "$delim" "$file" | md5sum
    # cat is deliberate: it makes transpose read from a pipe on stdin.
    cat "$file" |
        transpose "${blk[@]}" -d "$delim" | md5sum
    cat "$file.t" | md5sum
    transpose "${blk[@]}" -d "$delim" "$file.t" | md5sum
    cat "$file.t" |
        transpose "${blk[@]}" -d "$delim" | md5sum
    cat "$file" | md5sum
}
dotest() {
    # Run one test case: $1 is the CSV fixture, the remaining arguments are
    # handed on to md5transpose. The fixture pair is generated only when the
    # transposed companion file ($1.t) does not exist yet.
    [ -e "$1".t ] || make_csv "$1"
    md5transpose "$@"
    # Blank line between test cases.
    echo
}
# Source env_parallel.bash (looked up via PATH) to get env_parallel.
# NOTE(review): presumably this is what lets the jobs below call the shell
# functions defined in this script - confirm against the GNU Parallel docs.
. `which env_parallel.bash`
# Feed the test cases to env_parallel, one command per line. The
# here-document delimiter is unquoted, so the shell turns every \\t into \t
# before env_parallel reads the line.
env_parallel -r <<EOF
# Test --block 1 (problem with GNU Parallel < 20180422)
dotest /tmp/table-3,1000.csv -b 1
dotest /tmp/table-3,1000.csv
dotest /tmp/table-3,10000.csv
dotest /tmp/table-3,100000.csv
dotest /tmp/table-3,1000000.csv
dotest /tmp/table-3,10000000.csv
dotest '/tmp/table-10\\t20.csv'
dotest /tmp/table-10';'20.csv
dotest '/tmp/table-100\\t200.csv'
dotest /tmp/table-1,100.csv
dotest /tmp/table-10,1000.csv
dotest /tmp/table-100,10000.csv
dotest /tmp/table-1000,100000.csv
EOF

View file

@ -1,5 +1,188 @@
#!/bin/bash
: <<=cut
=pod
=head1 NAME
transpose - transpose CSV file
=head1 SYNOPSIS
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input>]
=head1 DESCRIPTION
B<transpose> will read a CSV file and write the transposed table to STDOUT (standard output).
=head1 OPTIONS
=over 9
=item I<input>
Input CSV file. If none is given reads from STDIN (standard input).
=item B<-d> I<delim>
Use I<delim> as delimiter in input and output.
=item B<-b> I<blocksize>
Pass chunks of I<blocksize> bytes to the internal transposer. Memory
usage will be 10 times I<blocksize> per CPU core. Default is 10M.
=back
=head1 EXAMPLES
=head2 EXAMPLE: Transpose a medium sized TSV file
cat medium.tsv | transpose -d '\t' > muidem.tsv
=head1 DESIGN
B<transpose> is designed to deal efficiently with medium sized data
(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It
works by chopping the input into 10 MB blocks. Each block is
transposed in parallel and saved to disk. Then these files are pasted
together and finally removed.
=head1 REPORTING BUGS
Report bugs to <tange@gnu.org>.
=head1 AUTHOR
Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free
Software Foundation, Inc.
=head1 LICENSE
Copyright (C) 2013 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head2 Documentation license I
Permission is granted to copy, distribute and/or modify this documentation
under the terms of the GNU Free Documentation License, Version 1.3 or
any later version published by the Free Software Foundation; with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts. A copy of the license is included in the file fdl.txt.
=head2 Documentation license II
You are free:
=over 9
=item B<to Share>
to copy, distribute and transmit the work
=item B<to Remix>
to adapt the work
=back
Under the following conditions:
=over 9
=item B<Attribution>
You must attribute the work in the manner specified by the author or
licensor (but not in any way that suggests that they endorse you or
your use of the work).
=item B<Share Alike>
If you alter, transform, or build upon this work, you may distribute
the resulting work only under the same, similar or a compatible
license.
=back
With the understanding that:
=over 9
=item B<Waiver>
Any of the above conditions can be waived if you get permission from
the copyright holder.
=item B<Public Domain>
Where the work or any of its elements is in the public domain under
applicable law, that status is in no way affected by the license.
=item B<Other Rights>
In no way are any of the following rights affected by the license:
=over 2
=item *
Your fair dealing or fair use rights, or other applicable
copyright exceptions and limitations;
=item *
The author's moral rights;
=item *
Rights other persons may have either in the work itself or in
how the work is used, such as publicity or privacy rights.
=back
=back
=over 9
=item B<Notice>
For any reuse or distribution, you must make clear to others the
license terms of this work.
=back
A copy of the full license is included in the file as cc-by-sa.txt.
=head1 DEPENDENCIES
B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
=head1 SEE ALSO
B<bash>(1), B<parallel>(1), B<paste>(1)
=cut
# transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv
# cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv
@ -88,23 +271,24 @@ super_paste() {
# basename
fifo=`tempfile`
rm $fifo
cat > $paste_files
# Group files from stdin in groups of 1000 files
parallel -k -n1000 echo > $paste_files
# Define replacement string {0#} to 0-pad job number
PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
$_=sprintf("%0${f}d",seq())'\'
# Make fifos that can be read from
cat $paste_files | parallel -n1000 "rm -f $fifo{0#}; mkfifo $fifo{0#}"
cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}"
# Start a paste process for every 1000 files
cat $paste_files | parallel -n1000 -j0 "paste -d '$sep' {} > $fifo{0#}" &
cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" &
# Paste all the fifos
eval paste -d "'$sep'" $fifo*
# Cleanup
cat $paste_files | parallel -n1000 "rm -f {} $fifo{0#}"
cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
rm $paste_files
}

View file

@ -1,9 +0,0 @@
Can it be done more simply?
zcat D.gz | perl -ne 's/\s+/\n/g; open(OUT,">","out".(++$out)); print OUT' ; paste out* | pigz >Dt.gz
Chop CSV into fields
multi file paste
paste out1 out2 | paste - out3

View file

@ -1,175 +0,0 @@
#!/usr/bin/perl
=head1 NAME
transpose - transpose CSV file
=head1 SYNOPSIS
B<transpose> [-d I<delim>] [I<input>]
=head1 DESCRIPTION
B<transpose> will read a CSV file and write the transposed table to STDOUT (standard output).
=head1 OPTIONS
=over 9
=item I<input>
Input CSV file. If none is given reads from STDIN (standard input).
=item B<-d> I<delim> - not implemented
Use I<delim> as delimiter in input and output.
=back
=head1 EXAMPLES
=head2 EXAMPLE: Transpose a big CSV file
cat medium.csv | transpose > muidem.csv
=head1 DESIGN
B<transpose> is designed to deal efficiently with medium sized data
(up to 30 TB per file) on systems with 250 MB RAM per CPU core. It
works by chopping the input into 30 MB blocks. Each block is
transposed in parallel and saved to disk. Then these files are pasted
together and finally removed.
=head1 REPORTING BUGS
Report bugs to <tange@gnu.org>.
=head1 AUTHOR
Copyright (C) 2013 Ole Tange, http://ole.tange.dk and Free
Software Foundation, Inc.
=head1 LICENSE
Copyright (C) 2013 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head2 Documentation license I
Permission is granted to copy, distribute and/or modify this documentation
under the terms of the GNU Free Documentation License, Version 1.3 or
any later version published by the Free Software Foundation; with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts. A copy of the license is included in the file fdl.txt.
=head2 Documentation license II
You are free:
=over 9
=item B<to Share>
to copy, distribute and transmit the work
=item B<to Remix>
to adapt the work
=back
Under the following conditions:
=over 9
=item B<Attribution>
You must attribute the work in the manner specified by the author or
licensor (but not in any way that suggests that they endorse you or
your use of the work).
=item B<Share Alike>
If you alter, transform, or build upon this work, you may distribute
the resulting work only under the same, similar or a compatible
license.
=back
With the understanding that:
=over 9
=item B<Waiver>
Any of the above conditions can be waived if you get permission from
the copyright holder.
=item B<Public Domain>
Where the work or any of its elements is in the public domain under
applicable law, that status is in no way affected by the license.
=item B<Other Rights>
In no way are any of the following rights affected by the license:
=over 2
=item *
Your fair dealing or fair use rights, or other applicable
copyright exceptions and limitations;
=item *
The author's moral rights;
=item *
Rights other persons may have either in the work itself or in
how the work is used, such as publicity or privacy rights.
=back
=back
=over 9
=item B<Notice>
For any reuse or distribution, you must make clear to others the
license terms of this work.
=back
A copy of the full license is included in the file as cc-by-sa.txt.
=head1 DEPENDENCIES
B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
=head1 SEE ALSO
B<bash>(1), B<parallel>(1), B<paste>(1)
=cut

View file

@ -1,80 +0,0 @@
#!/usr/bin/perl
use File::Temp qw(tempfile tempdir);
use strict;
use warnings;

# Debug tracing is off; set to 1 to make debug() print the commands
# as they are run.
$Global::debug = 0;

# Size of the input chunks handed to each transpose-par.pl job.
my $block = "30m";

# Let parallel run transpose-par.pl on every chunk of STDIN. Each line it
# prints is used below as the name of a file to paste.
debug("parallel --pipe --block $block -k --files -j150% transpose-par.pl\n");
my @files = `parallel --pipe --block $block -k --files -j150% transpose-par.pl`;
chomp(@files);

my $tmp = File::Temp::tempdir(CLEANUP => 1);

# Longest command line we may build. If parallel gives no usable number the
# limit is not enforced.
my $max_line_length_allowed = `parallel --max-line-length-allowed`;
$max_line_length_allowed = "" unless defined $max_line_length_allowed;
chomp($max_line_length_allowed);
my $limit = $max_line_length_allowed =~ /\A(\d+)\z/ ? $1 : undef;

# Split the files into groups that each fit on one command line. Every
# group gets its own fifo and a background paste process writing to it.
my (@fifos, @args);
my $fifo_no  = 0;
my $fifo     = _fifo_name($tmp, $fifo_no);
my $cmd      = _paste_command($fifo);
my $args_len = 0;
while (@files) {
    push @args, shift @files;
    # Parentheses are required: "length $x + 1" parses as length($x + 1).
    $args_len += length($args[-1]) + 1;
    # A group always keeps at least one file; otherwise a single over-long
    # name would be pushed back forever.
    if (defined $limit and @args > 1
        and length($cmd) + $args_len > $limit) {
        # The newest file made the line too long: give it to the next group.
        unshift @files, pop @args;
        push @fifos, $fifo;
        _start_paste($cmd, @args);
        # Number the fifos explicitly: ++ on a string containing "/" is a
        # numeric increment and would turn the whole path into "1".
        $fifo     = _fifo_name($tmp, ++$fifo_no);
        $cmd      = _paste_command($fifo);
        @args     = ();
        $args_len = 0;
    }
}
if (@args) {
    push @fifos, $fifo;
    _start_paste($cmd, @args);
}

# make sure all fifos are created by the spawned shells
for my $pending (@fifos) {
    usleep(1) until -e $pending;
}

# Paste the per-group results side by side onto STDOUT.
debug("paste @fifos\n");
system("paste @fifos");

# Reap the helper processes so none are left as zombies.
1 while wait() != -1;

unlink(@fifos);
rmdir($tmp);

sub _fifo_name {
    # Returns: name of fifo number $number in directory $dir (7 digits).
    my ($dir, $number) = @_;
    return sprintf("%s/%07d", $dir, $number);
}

sub _paste_command {
    # Returns: shell command prefix that creates $fifo and pastes into it;
    # the files to paste are appended by the caller.
    my ($fifo) = @_;
    return "mkfifo $fifo; paste > $fifo ";
}

sub _start_paste {
    # Fork a child that starts "$cmd @args" in a background subshell.
    # Returns in the parent only; the child exits after launching the shell.
    my ($cmd, @args) = @_;
    my $pid = fork();
    defined $pid or die "transpose: cannot fork: $!\n";
    return if $pid;
    debug("($cmd @args &)\n");
    `($cmd @args &)`;
    # $? is a wait status; the exit code is in the high byte.
    exit($? >> 8);
}
sub usleep {
    # Pause for the given number of milliseconds (may be fractional).
    # The whole-millisecond part is reported through ::debug first.
    my ($ms) = @_;
    ::debug(int($ms), "ms ");
    # 4-argument select with no handles is a sub-second sleep (in seconds).
    select(undef, undef, undef, $ms / 1000);
}
sub debug {
    # Print the arguments if $Global::debug is true; otherwise do nothing.
    # Output goes to the saved original stdout ($Global::fd{1}) when there
    # is one, else to the currently selected filehandle.
    # Returns: N/A
    $Global::debug or return;
    # Replace undef with "" so printing never warns. This must be map:
    # grep would filter on truth and silently drop "0" and "" as well.
    my @msg = map { defined $_ ? $_ : "" } @_;
    if($Global::fd{1}) {
	# Original stdout was saved
	my $stdout = $Global::fd{1};
	print {$stdout} @msg;
    } else {
	print @msg;
    }
}