transpose: Race condition bug + testing.
This commit is contained in:
parent
f6a34e1200
commit
c9488299dc
19
Makefile
19
Makefile
|
@ -1,10 +1,19 @@
|
|||
CMD = blink bsearch duplicate-packets em encdir field forever G \
|
||||
gitnext gitundo goodpasswd histogram mtrr mirrorpdf neno off pdfman \
|
||||
puniq ramusage rand rclean rina rn rrm shython sound-reload stdout \
|
||||
swapout T timestamp tracefile upsidedown w4it-for-port-open \
|
||||
wifi-reload wssh ytv yyyymmdd
|
||||
gitnext gitundo goodpasswd histogram mtrr mirrorpdf neno off \
|
||||
pdfman puniq ramusage rand rclean rina rn rrm shython \
|
||||
sound-reload stdout swapout T timestamp tracefile transpose \
|
||||
upsidedown w4it-for-port-open wifi-reload wssh ytv yyyymmdd
|
||||
|
||||
all: blink/blink.1 bsearch/bsearch.1 encdir/encdir.1 G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 goodpasswd/goodpasswd.1 histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 shython/shython.1 sound-reload/sound-reload.1 stdout/stdout.1 timestamp/timestamp.1 tracefile/tracefile.1 T/T.1 upsidedown/upsidedown.1 wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 yyyymmdd/yyyymmdd.1
|
||||
all: blink/blink.1 bsearch/bsearch.1 encdir/encdir.1 G/G.1 \
|
||||
gitnext/gitnext.1 gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
|
||||
histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
|
||||
off/off.1 pdfman/pdfman.1 puniq/puniq.1 rand/rand.1 \
|
||||
rina/rina.1 rn/rn.1 rrm/rrm.1 shython/shython.1 \
|
||||
sound-reload/sound-reload.1 stdout/stdout.1 \
|
||||
timestamp/timestamp.1 tracefile/tracefile.1 \
|
||||
transpose/transpose.1 T/T.1 upsidedown/upsidedown.1 \
|
||||
wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 \
|
||||
yyyymmdd/yyyymmdd.1
|
||||
|
||||
%.1: %
|
||||
pod2man $< > $@
|
||||
|
|
57
transpose/test.sh
Normal file
57
transpose/test.sh
Normal file
|
@ -0,0 +1,57 @@
|
|||
#!/bin/bash
|
||||
|
||||
make_csv() {
    # Create the test fixture pair for the file name given as $1:
    #   $1    XXXsepYYY.csv (XXX rows, YYY cols, sep as separator)
    #   $1.t  the same table transposed (YYY rows, XXX cols)
    # XXX, sep and YYY are parsed from the file name by the Perl one-liners.
    # sep is evaluated as a double-quoted Perl string, so a literal \t in
    # the name produces a TAB separator.
    # Every cell holds "column-row" of the normal table, which makes the
    # two files exact transposes of each other.
    normal() {
        # "$1" is quoted so the name is passed and redirected to as a
        # single word, whatever characters it contains.
        perl -e '($m,$sep,$n) = $ARGV[0]=~/(\d+)(\D+)(\d+)/; $sep = eval "\"$sep\""; for $l (1..$m){ print join $sep, map {"$_-$l"} (1..$n); print "\n" }' "$1" > "$1"
    }
    transposed() {
        perl -e '($m,$sep,$n) = $ARGV[0]=~/(\d+)(\D+)(\d+)/; $sep = eval "\"$sep\""; for $l (1..$n){ print join $sep, map {"$l-$_"} (1..$m); print "\n" }' "$1" > "$1.t"
    }
    # The helpers must be exported to be callable from the jobs parallel starts.
    export -f normal transposed
    # Run both generators at the same time: {} is replaced by the function name.
    parallel -q {} "$@" ::: normal transposed
}
|
||||
|
||||
md5transpose() {
    # Print checksums showing that transpose turns $1 into $1.t and back.
    #   $1     table file; the delimiter is derived from its name
    #   $2 $3  optional extra options passed to transpose (e.g. -b 1)
    # Output: "File <name>", then two groups of three md5sums separated by
    # nothing else: transpose reading the file by name, transpose reading
    # the same data on stdin, and the expected result. Within each group
    # the three checksums should be identical.
    local file blk delim
    file=$1
    # Kept as one string and expanded unquoted below on purpose, so that
    # "-b 1" becomes two words and an empty value becomes no words at all.
    blk="$2 $3"
    # The delimiter is the run of non-digits between the two numbers in the
    # file name (e.g. "," in table-3,1000.csv). Computed once and reused.
    delim=$(echo "$file" | perl -pe 's/.*\d(\D+)\d.*/$1/')
    echo "File $file"

    # Normal table -> should match the checksum of the .t file
    transpose $blk -d "$delim" "$file" | md5sum
    # cat keeps stdin a pipe, so the read-from-stdin path is exercised too
    cat "$file" |
        transpose $blk -d "$delim" | md5sum
    cat "$file.t" | md5sum

    # Transposed table -> should match the checksum of the normal file
    transpose $blk -d "$delim" "$file.t" | md5sum
    cat "$file.t" |
        transpose $blk -d "$delim" | md5sum
    cat "$file" | md5sum
}
|
||||
|
||||
dotest() {
    # Run one test case.
    #   $1     table file name (see make_csv for the naming scheme)
    #   $2...  passed on to md5transpose together with $1
    # The fixture files are only generated when the transposed
    # reference file ($1.t) is missing.
    [ -e "$1".t ] || make_csv "$1"
    md5transpose "$@"
    # Blank line between the reports of consecutive test cases
    echo
}
|
||||
|
||||
# Load env_parallel (part of GNU Parallel) into this shell. Fail with a
# clear message if it is not installed instead of sourcing an empty name.
env_parallel_lib=$(command -v env_parallel.bash) || {
    echo "test.sh: env_parallel.bash not found in PATH (is GNU Parallel installed?)" >&2
    exit 1
}
. "$env_parallel_lib"

# Each line of the here-document is one test case run through env_parallel.
# The here-document is unquoted, so \\t reaches dotest as a literal \t.
env_parallel -r <<EOF
# Test --block 1 (problem with GNU Parallel < 20180422)
dotest /tmp/table-3,1000.csv -b 1
dotest /tmp/table-3,1000.csv
dotest /tmp/table-3,10000.csv
dotest /tmp/table-3,100000.csv
dotest /tmp/table-3,1000000.csv
dotest /tmp/table-3,10000000.csv

dotest '/tmp/table-10\\t20.csv'
dotest /tmp/table-10';'20.csv
dotest '/tmp/table-100\\t200.csv'
dotest /tmp/table-1,100.csv
dotest /tmp/table-10,1000.csv
dotest /tmp/table-100,10000.csv
dotest /tmp/table-1000,100000.csv
EOF
|
||||
|
|
@ -1,5 +1,188 @@
|
|||
#!/bin/bash
|
||||
|
||||
: <<=cut
|
||||
=pod
|
||||
|
||||
=head1 NAME
|
||||
|
||||
transpose - transpose CSV file
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input>]
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
B<transpose> will read a CSV file and write the transposed table to STDOUT (standard output).
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
=over 9
|
||||
|
||||
=item I<input>
|
||||
|
||||
Input CSV file. If none is given reads from STDIN (standard input).
|
||||
|
||||
|
||||
=item B<-d> I<delim>
|
||||
|
||||
Use I<delim> as delimiter in input and output.
|
||||
|
||||
|
||||
=item B<-b> I<blocksize>
|
||||
|
||||
Pass chunks of I<blocksize> bytes to the internal transposer. Memory
|
||||
usage will be 10 times I<blocksize> per CPU core. Default is 10M.
|
||||
|
||||
|
||||
=back
|
||||
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 EXAMPLE: Transpose a medium sized TSV file
|
||||
|
||||
cat medium.tsv | transpose -d '\t' > muidem.tsv
|
||||
|
||||
=head1 DESIGN
|
||||
|
||||
B<transpose> is designed to deal efficiently with medium sized data
|
||||
(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It
|
||||
works by chopping the input into 10 MB blocks. Each block is
|
||||
transposed in parallel and saved to disk. Then these files are pasted
|
||||
together and finally removed.
|
||||
|
||||
=head1 REPORTING BUGS
|
||||
|
||||
Report bugs to <tange@gnu.org>.
|
||||
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free
|
||||
Software Foundation, Inc.
|
||||
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
Copyright (C) 2013 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
=head2 Documentation license I
|
||||
|
||||
Permission is granted to copy, distribute and/or modify this documentation
|
||||
under the terms of the GNU Free Documentation License, Version 1.3 or
|
||||
any later version published by the Free Software Foundation; with no
|
||||
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
|
||||
Texts. A copy of the license is included in the file fdl.txt.
|
||||
|
||||
=head2 Documentation license II
|
||||
|
||||
You are free:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<to Share>
|
||||
|
||||
to copy, distribute and transmit the work
|
||||
|
||||
=item B<to Remix>
|
||||
|
||||
to adapt the work
|
||||
|
||||
=back
|
||||
|
||||
Under the following conditions:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Attribution>
|
||||
|
||||
You must attribute the work in the manner specified by the author or
|
||||
licensor (but not in any way that suggests that they endorse you or
|
||||
your use of the work).
|
||||
|
||||
=item B<Share Alike>
|
||||
|
||||
If you alter, transform, or build upon this work, you may distribute
|
||||
the resulting work only under the same, similar or a compatible
|
||||
license.
|
||||
|
||||
=back
|
||||
|
||||
With the understanding that:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Waiver>
|
||||
|
||||
Any of the above conditions can be waived if you get permission from
|
||||
the copyright holder.
|
||||
|
||||
=item B<Public Domain>
|
||||
|
||||
Where the work or any of its elements is in the public domain under
|
||||
applicable law, that status is in no way affected by the license.
|
||||
|
||||
=item B<Other Rights>
|
||||
|
||||
In no way are any of the following rights affected by the license:
|
||||
|
||||
=over 2
|
||||
|
||||
=item *
|
||||
|
||||
Your fair dealing or fair use rights, or other applicable
|
||||
copyright exceptions and limitations;
|
||||
|
||||
=item *
|
||||
|
||||
The author's moral rights;
|
||||
|
||||
=item *
|
||||
|
||||
Rights other persons may have either in the work itself or in
|
||||
how the work is used, such as publicity or privacy rights.
|
||||
|
||||
=back
|
||||
|
||||
=back
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Notice>
|
||||
|
||||
For any reuse or distribution, you must make clear to others the
|
||||
license terms of this work.
|
||||
|
||||
=back
|
||||
|
||||
A copy of the full license is included in the file cc-by-sa.txt.
|
||||
|
||||
=head1 DEPENDENCIES
|
||||
|
||||
B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
|
||||
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
B<bash>(1), B<parallel>(1), B<paste>(1)
|
||||
|
||||
=cut
|
||||
|
||||
|
||||
# transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv
|
||||
# cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv
|
||||
|
||||
|
@ -88,23 +271,24 @@ super_paste() {
|
|||
# basename
|
||||
fifo=`tempfile`
|
||||
rm $fifo
|
||||
cat > $paste_files
|
||||
# Group files from stdin in groups of 1000 files
|
||||
parallel -k -n1000 echo > $paste_files
|
||||
|
||||
# Define replacement string {0#} to 0-pad job number
|
||||
PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
|
||||
export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
|
||||
$_=sprintf("%0${f}d",seq())'\'
|
||||
|
||||
# Make fifos that can be read from
|
||||
cat $paste_files | parallel -n1000 "rm -f $fifo{0#}; mkfifo $fifo{0#}"
|
||||
cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}"
|
||||
|
||||
# Start a paste process for every 1000 files
|
||||
cat $paste_files | parallel -n1000 -j0 "paste -d '$sep' {} > $fifo{0#}" &
|
||||
cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" &
|
||||
|
||||
# Paste all the fifos
|
||||
eval paste -d "'$sep'" $fifo*
|
||||
|
||||
# Cleanup
|
||||
cat $paste_files | parallel -n1000 "rm -f {} $fifo{0#}"
|
||||
cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
|
||||
rm $paste_files
|
||||
}
|
||||
|
||||
|
|
|
@ -1,9 +0,0 @@
|
|||
Can it be done more simply?
|
||||
|
||||
zcat D.gz | perl -ne 's/\s+/\n/g; open(OUT,">","out".(++$out)); print OUT' ; paste out* | pigz >Dt.gz
|
||||
|
||||
Chop CSV into fields
|
||||
|
||||
multi file paste
|
||||
|
||||
paste out1 out2 | paste - out3
|
|
@ -1,175 +0,0 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
=head1 NAME
|
||||
|
||||
transpose - transpose CSV file
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
B<transpose> [-d I<delim>] [I<input>]
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
B<transpose> will read a CSV file and write the transposed table to STDOUT (standard output).
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
=over 9
|
||||
|
||||
=item I<input>
|
||||
|
||||
Input CSV file. If none is given reads from STDIN (standard input).
|
||||
|
||||
|
||||
=item B<-d> I<delim> - not implemented
|
||||
|
||||
Use I<delim> as delimiter in input and output.
|
||||
|
||||
|
||||
=back
|
||||
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 EXAMPLE: Transpose a big CSV file
|
||||
|
||||
cat medium.csv | transpose > muidem.csv
|
||||
|
||||
=head1 DESIGN
|
||||
|
||||
B<transpose> is designed to deal efficiently with medium sized data
|
||||
(up to 30 TB per file) on systems with 250 MB RAM per CPU core. It
|
||||
works by chopping the input into 30 MB blocks. Each block is
|
||||
transposed in parallel and saved to disk. Then these files are pasted
|
||||
together and finally removed.
|
||||
|
||||
=head1 REPORTING BUGS
|
||||
|
||||
Report bugs to <tange@gnu.org>.
|
||||
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Copyright (C) 2013 Ole Tange, http://ole.tange.dk and Free
|
||||
Software Foundation, Inc.
|
||||
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
Copyright (C) 2013 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
=head2 Documentation license I
|
||||
|
||||
Permission is granted to copy, distribute and/or modify this documentation
|
||||
under the terms of the GNU Free Documentation License, Version 1.3 or
|
||||
any later version published by the Free Software Foundation; with no
|
||||
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
|
||||
Texts. A copy of the license is included in the file fdl.txt.
|
||||
|
||||
=head2 Documentation license II
|
||||
|
||||
You are free:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<to Share>
|
||||
|
||||
to copy, distribute and transmit the work
|
||||
|
||||
=item B<to Remix>
|
||||
|
||||
to adapt the work
|
||||
|
||||
=back
|
||||
|
||||
Under the following conditions:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Attribution>
|
||||
|
||||
You must attribute the work in the manner specified by the author or
|
||||
licensor (but not in any way that suggests that they endorse you or
|
||||
your use of the work).
|
||||
|
||||
=item B<Share Alike>
|
||||
|
||||
If you alter, transform, or build upon this work, you may distribute
|
||||
the resulting work only under the same, similar or a compatible
|
||||
license.
|
||||
|
||||
=back
|
||||
|
||||
With the understanding that:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Waiver>
|
||||
|
||||
Any of the above conditions can be waived if you get permission from
|
||||
the copyright holder.
|
||||
|
||||
=item B<Public Domain>
|
||||
|
||||
Where the work or any of its elements is in the public domain under
|
||||
applicable law, that status is in no way affected by the license.
|
||||
|
||||
=item B<Other Rights>
|
||||
|
||||
In no way are any of the following rights affected by the license:
|
||||
|
||||
=over 2
|
||||
|
||||
=item *
|
||||
|
||||
Your fair dealing or fair use rights, or other applicable
|
||||
copyright exceptions and limitations;
|
||||
|
||||
=item *
|
||||
|
||||
The author's moral rights;
|
||||
|
||||
=item *
|
||||
|
||||
Rights other persons may have either in the work itself or in
|
||||
how the work is used, such as publicity or privacy rights.
|
||||
|
||||
=back
|
||||
|
||||
=back
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Notice>
|
||||
|
||||
For any reuse or distribution, you must make clear to others the
|
||||
license terms of this work.
|
||||
|
||||
=back
|
||||
|
||||
A copy of the full license is included in the file cc-by-sa.txt.
|
||||
|
||||
=head1 DEPENDENCIES
|
||||
|
||||
B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
|
||||
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
B<bash>(1), B<parallel>(1), B<paste>(1)
|
||||
|
||||
=cut
|
||||
|
|
@ -1,80 +0,0 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use File::Temp qw(tempfile tempdir);
|
||||
|
||||
#$Global::debug = 1;

# Overview: the input on STDIN is chopped into blocks that are handed to
# transpose-par.pl in parallel. Every line printed by that pipeline is
# used as the name of a file to paste. The files are pasted in groups
# that fit on one command line, each group writing to its own fifo, and
# finally all fifos are pasted together to STDOUT.

# Size of each block of input.
my $block = "30m";
debug("parallel --pipe --block $block -k --files -j150% transpose-par.pl\n");
my @files = `parallel --pipe --block $block -k --files -j150% transpose-par.pl`;
chomp(@files);

my $tmp = File::Temp::tempdir(CLEANUP => 1);
my $max_line_length_allowed = `parallel --max-line-length-allowed`;
chomp($max_line_length_allowed);

sub fifo_path {
    # Returns: path of fifo number $number in directory $dir
    # The number is formatted explicitly. Perl's magic string increment
    # only applies to strings matching /^[a-zA-Z]*[0-9]*$/, so ++ on a
    # path containing "/" would turn the whole path into the number 1.
    my ($dir, $number) = @_;
    return sprintf("%s/%07d", $dir, $number);
}

sub paste_cmd {
    # Returns: shell command prefix that creates $fifo and pastes into it
    my ($fifo) = @_;
    return "mkfifo $fifo; paste > $fifo ";
}

sub start_paste {
    # Start "$cmd @args" in the background from a forked child.
    # Dies if the child cannot be forked.
    my ($cmd, @args) = @_;
    my $pid = fork();
    defined $pid or die "transpose: fork failed: $!\n";
    if($pid == 0) {
	# Child
	debug("($cmd @args &)\n");
	`($cmd @args &)`;
	# $? is a wait status; the exit code is in the high byte
	exit($? >> 8);
    }
}

my $fifo_number = 0;
my $fifo = fifo_path($tmp, $fifo_number);
my $cmd = paste_cmd($fifo);
my (@fifos, @args);
my $args_len = 0;

while(@files) {
    my $file = shift @files;
    # +1 for the space separating the arguments. length() needs the
    # parentheses: "length $file + 1" is parsed as length($file + 1).
    my $file_len = length($file) + 1;
    if(@args
       and length($cmd) + $args_len + $file_len > $max_line_length_allowed) {
	# $file does not fit on this command line:
	# start paste for the files collected so far and begin a new group.
	# (A group always holds at least one file, so this loop terminates
	# even if a single name is longer than the limit.)
	push @fifos, $fifo;
	start_paste($cmd, @args);
	$fifo_number++;
	$fifo = fifo_path($tmp, $fifo_number);
	$cmd = paste_cmd($fifo);
	@args = ();
	$args_len = 0;
    }
    push @args, $file;
    $args_len += $file_len;
}

if(@args) {
    # Last (possibly only) group
    push @fifos, $fifo;
    start_paste($cmd, @args);
}

# make sure all fifos are created by the spawned shells
for my $expected_fifo (@fifos) {
    usleep(1) until -e $expected_fifo;
}

debug("paste @fifos\n");
system("paste @fifos");

unlink(@fifos);
rmdir($tmp);
|
||||
|
||||
sub usleep {
    # Pause for the given number of milliseconds.
    # Returns: whatever the 4-argument select returns
    my ($ms) = @_;
    ::debug(int($ms),"ms ");
    # select with no filehandles only waits; its timeout is in seconds
    return select(undef, undef, undef, $ms/1000);
}
|
||||
|
||||
sub debug {
    # Print the arguments if $Global::debug is true; otherwise do nothing.
    # Undefined arguments are printed as empty strings.
    # Output goes to the filehandle saved in $Global::fd{1} if there is
    # one, else to the currently selected filehandle.
    # Returns: N/A
    $Global::debug or return;
    # map, not grep: grep would silently drop every false argument
    # (undef, "" and 0) instead of replacing undef with "".
    my @messages = map { defined $_ ? $_ : "" } @_;
    if($Global::fd{1}) {
	# Original stdout was saved
	my $stdout = $Global::fd{1};
	print $stdout @messages;
    } else {
	print @messages;
    }
}
|
Loading…
Reference in a new issue