# tangetools/transpose/transpose
#
# 572 lines
# 13 KiB
# Bash
# Executable file
#
#!/bin/bash
: <<'_EOS'
=pod
=cut
=head1 NAME
transpose - transpose CSV file
=head1 SYNOPSIS
... | B<transpose> [-d I<delim>] [-b I<blocksize>]
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input.csv>]
=head1 DESCRIPTION
B<transpose> will read a CSV file and write the transposed version of
the file. I.e. rows will be columns, and columns will be rows.
=head1 OPTIONS
=over 9
=item I<input.csv>
Input CSV file. If none is given B<transpose> reads from STDIN
(standard input).
=item B<-d> I<delim>
Use I<delim> as delimiter in input and output. If no delimiter is
given, B<transpose> will read the first 3 rows and try to guess the
delimiter.
The autodetection does not work well if values contain a quoted
delimiter: E.g. a,"value with quoted ,",other value
=item B<-b> I<blocksize>
Pass chunks of I<blocksize> bytes to the internal transposer. Memory
usage will be 10 times I<blocksize> per CPU core. Default is 100M.
=back
=head1 EXAMPLES
=head2 EXAMPLE: Transpose a TSV file
cat normal.tsv | transpose -d '\t' > transposed.tsv
=head2 EXAMPLE: transpose 57 GB 100000x100000 CSV file
# Generate 100000x100000 matrix
100kx100k() {
XbyY() {
while seq 123456 | shuf; do true; done |
paste $(perl -e 'print map {"- "} 1..'$1) |
head -n $2
}
export -f XbyY
seq 1000 |
parallel --nice 18 --delay 0.05 --files XbyY 100000 100 |
parallel -uj1 'cat {}; nice rm {} &'
}
100kx100k > 100kx100k
# Transpose it
transpose 100kx100k > 100kx100k.t
This takes around 1 GB/core and 18 minutes to run on 64C64T.
=head1 LIMITATIONS
B<transpose> is tested on a 1000000x1000000 3.6 TB table.
There is a limit on how many filehandles can be used in super_paste.
This is probably in the order of 1000000. This limit is the number of
temporary files. By increasing the block size the number of temporary
files will be lowered. The 3.6 TB test resulted in 36000 files, so if
the limit is 1000000 files, it should work fine up to 100 TB before
you need to increase the block size.
=head1 DESIGN
B<transpose> is designed to deal efficiently with medium sized data
(up to 30 TB files) on systems with 2 GB RAM per CPU core. It works by
chopping the input into blocks (default: 100 MB). Each block is
transposed in parallel and saved to disk. Then these files are pasted
together and finally removed.
B<transpose> uses B<csvtool> if installed and a (slower) perl script
otherwise.
=head1 BUGS
B<transpose> makes files in $TMPDIR (default: /tmp). These are not
cleaned up, if B<transpose> is stopped abnormally (e.g. killed).
=head1 REPORTING BUGS
Report bugs: https://gitlab.com/ole.tange/tangetools/-/issues
=head1 AUTHOR
Copyright (C) 2013-2020 Ole Tange, http://ole.tange.dk and Free
Software Foundation, Inc.
=head1 LICENSE
Copyright (C) 2013 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head2 Documentation license I
Permission is granted to copy, distribute and/or modify this documentation
under the terms of the GNU Free Documentation License, Version 1.3 or
any later version published by the Free Software Foundation; with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts. A copy of the license is included in the file fdl.txt.
=head2 Documentation license II
You are free:
=over 9
=item B<to Share>
to copy, distribute and transmit the work
=item B<to Remix>
to adapt the work
=back
Under the following conditions:
=over 9
=item B<Attribution>
You must attribute the work in the manner specified by the author or
licensor (but not in any way that suggests that they endorse you or
your use of the work).
=item B<Share Alike>
If you alter, transform, or build upon this work, you may distribute
the resulting work only under the same, similar or a compatible
license.
=back
With the understanding that:
=over 9
=item B<Waiver>
Any of the above conditions can be waived if you get permission from
the copyright holder.
=item B<Public Domain>
Where the work or any of its elements is in the public domain under
applicable law, that status is in no way affected by the license.
=item B<Other Rights>
In no way are any of the following rights affected by the license:
=over 2
=item *
Your fair dealing or fair use rights, or other applicable
copyright exceptions and limitations;
=item *
The author's moral rights;
=item *
Rights other persons may have either in the work itself or in
how the work is used, such as publicity or privacy rights.
=back
=back
=over 9
=item B<Notice>
For any reuse or distribution, you must make clear to others the
license terms of this work.
=back
A copy of the full license is included in the file as cc-by-sa.txt.
=head1 DEPENDENCIES
B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
=head1 SEE ALSO
B<csvtool>(1), B<bash>(1), B<parallel>(1), B<paste>(1)
=cut
_EOS
#'
# Timings: 100kx200k (114GB) 34min
# 200kx200k (228GB) 63min
transpose_perl() {
	# Transpose CSV read from stdin to stdout using an in-memory
	# Perl (Text::CSV) transposer.
	#
	# Arguments:
	#   $1   = field separator; the special value '\s+' means
	#          "runs of whitespace"
	#   rest = extra arguments forwarded to the embedded perl script
	#
	# Standard input:
	#   data to be transposed
	# Standard output:
	#   transposed data
	sep="$1"
	shift
	if [ "$sep" == '\s+' ] ; then
		# Multiple spaces = separator: convert TABs to spaces, then
		# squeeze runs of spaces to a single space, so the CSV parser
		# below can use a plain single-space separator.
		# (Was 'tr " " " "', a no-op: space-to-space translation.)
		space_merger() { tr '\t' ' ' | tr -s ' '; }
		sep=" "
	else
		space_merger() { cat; }
	fi
	space_merger | perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
#!/usr/bin/perl
# In-memory transpose: store input cell (row r, field f) at
# $table[f][r], then print @table row by row.
use Text::CSV;
use Getopt::Long;
Getopt::Long::Configure("bundling","require_order");
my $retval = GetOptions("debug|D=s" => \$opt::debug,
			"delimiter|d=s" => \$opt::delimiter,
			"verbose|v" => \@opt::verbose,
			"simple|s" => \$opt::simple,
    );
if(defined $opt::delimiter) {
    simple();
} else {
    die("-d must be set");
}
sub simple {
    my (@table);
    my $col = 0;
    my $csv_setting = { binary => 1, sep_char => $opt::delimiter };
    my $sep = $csv_setting->{sep_char};
    my $csv = Text::CSV->new($csv_setting)
	or die "Cannot use CSV: ".Text::CSV->error_diag ();
    while(my $l = <>) {
	if(not $csv->parse($l)) {
	    die "CSV has unexpected format";
	}
	# Input line number $col becomes output column $col:
	# append each field of this line to its output row.
	my $row = 0;
	for($csv->fields()) {
	    $table[$row][$col] = defined($_) ? $_ : '';
	    $row++;
	}
	$col++;
    }
    print map { join($sep,@$_),"\n" } @table;
}
cut-here-UbsAqi0j6GoOuk5W5yWA
) -d "$sep" "$@"
}
export -f transpose_perl
transpose_csvtool() {
	# Transpose CSV read from stdin to stdout using csvtool.
	#
	# Arguments:
	#   $1 = field separator; the special value '\s+' means
	#        "runs of whitespace"
	#
	# Standard input:
	#   data to be transposed
	# Standard output:
	#   transposed data
	sep="$1"
	if [ "$sep" == "\s+" ] ; then
		# Multiple spaces = separator: convert TABs to spaces and
		# squeeze space runs, so csvtool can treat a single space
		# as the separator.
		# (Was 'tr " " " "', a no-op: space-to-space translation.)
		tr '\t' ' ' | tr -s ' ' |
			csvtool transpose -t " " -u " " -
	else
		csvtool transpose -t "$sep" -u "$sep" -
	fi
}
export -f transpose_csvtool
detect_transposer() {
	# Print the name of the fastest installed transpose implementation.
	# csvtool is preferred; the Perl Text::CSV implementation is the
	# fallback. Each candidate is probed by actually transposing a
	# one-cell input in a child shell.
	# If neither works, print an error on stderr (and nothing on stdout).
	local candidate
	for candidate in transpose_csvtool transpose_perl; do
		if bash -c "echo 1 | $candidate ," >/dev/null 2>&1; then
			echo "$candidate"
			return
		fi
	done
	echo Error: You need the Perl Text::CSV module or csvtool. >&2
}
make_paste_files() {
# Transpose input in blocks
# Each block is transposed in parallel; GNU Parallel's --files makes
# each job write its output to a temporary file and print that file's
# name on stdout.
# Arguments:
#   $1 = block size (passed to GNU Parallel's --block)
#   $2 = field separator (forwarded to the chosen transposer)
#   $3 = input file; if empty, read from stdin
# Output:
# each transposed block as file name
block_size="$1"
sep="$2"
file="$3"
transposer=$(detect_transposer)
# -k keeps the file names in input order so super_paste pastes the
# blocks back in the right order
par_opt="-k --files --block $block_size"
if [ -z "$file" ]; then
# stdin is not seekable: chop it into blocks with --pipe
parallel $par_opt --pipe "$transposer" "'$sep'"
else
# a regular file is seekable: --pipe-part is much faster than --pipe
parallel $par_opt --pipe-part -a "$file" "$transposer" "'$sep'"
fi
}
super_paste() {
# Like 'paste' up to 1000000 files
# More than 250000 files requires extra filehandles for GNU Parallel
# The files are read from stdin
# Works around the per-process open-file limit of a single 'paste' by
# pasting the files in groups of 1000 into fifos, then pasting the
# fifos together.
# Arguments:
#   $1 = separator to paste with
cleanup() {
# SIGINT handler: remove all tempfiles; double-backgrounded so the
# removal does not delay the exit
printf "\rSIGINT caught \n" >&2
(rm -rf "$TMPDIR" &)&
exit 1
}
trap 'cleanup' SIGINT
sep="$1"
paste_files=$(tempfile)
# basename
fifo=$(tempfile)
rm "$fifo"
# Group files from stdin in groups of 1000 files
# (one line per group in $paste_files)
parallel -k -n1000 echo > "$paste_files"
# Define replacement string {0#} to 0-pad job number
# so the fifo names sort in job order for the "$fifo"* glob below
export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
$_=sprintf("%0${f}d",seq())'\'
# Make fifos that can be read from
cat "$paste_files" | parallel "rm -f '$fifo'{0#}; mkfifo '$fifo'{0#}"
# Start a paste process for every 1000 files
# (-j0 = all groups at once; each group's output feeds its fifo)
cat "$paste_files" | parallel -j0 "eval paste -d \''$sep'\' {} > '$fifo'{0#}" &
# Paste all the fifos
eval paste -d "'$sep'" "$fifo"*
# Cleanup
cat "$paste_files" | parallel "eval rm -f {} '$fifo'{0#}"
rm "$paste_files"
}
stdin_detect_sep() {
# Read the first 3 lines of stdin and detect the separator
# Only , space tab ; | : \0 and whitespace are detected
# Save the 3 lines input to file so it can be read again later
# Arguments:
#   $1 = file to save the consumed input in, so the caller can replay
#        it in front of the remaining stdin
# Output: the detected separator (empty string if none could be found)
perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
#!/usr/bin/perl
sub max(@) {
# Returns:
# Maximum value of array
my $max;
for (@_) {
# Skip undefs
defined $_ or next;
defined $max or do { $max = $_; next; }; # Set $_ to the first non-undef
$max = ($max > $_) ? $max : $_;
}
return $max;
}
sub find_sep(@) {
# Try the common separators.
# Do we get the same for each line?
# Pick the separator that splits the most lines into the same
# (and largest) number of columns.
# Comment lines (#...) are ignored.
my @csv = grep { not /^#/ } @_;
my @sep = (",", "\t", ";", ' ', '\|', ':', "\0", '\s+');
my $columns;
my %col;
for my $sep (@sep) {
for my $line (@csv) {
# split in scalar context = number of fields on this line
$columns = split /$sep/, $line;
if($columns > 1) {
# Count lines per (separator, column-count) pair; key is "sep\0count"
$col{$sep."\0".$columns}++
}
}
}
# Find max $col{$sep,$columns}
my $most_lines = max(values %col);
my %sepcol = (map { split /\0/, $_ }
grep { $col{$_} == $most_lines } keys %col);
my $most_cols = max(values %sepcol);
return ((grep { $sepcol{$_} == $most_cols } keys %sepcol)[0]);
}
my $buf = "";
my $newlines = 0;
open(OUT, "+>", shift) || die;
# Copy (at least) 3 lines to OUT
# Read in 128 KB chunks until at least 3 newlines have been seen
while(sysread(STDIN,$buf,131072)) {
print OUT $buf;
$newlines += $buf =~ tr/\n/\n/;
if($newlines >= 3) {
last;
}
}
seek(OUT,0,0) || die;
my @lines = <OUT>;
close OUT;
# Remove last half-line
# (the final chunk may have stopped mid-line)
pop @lines;
print find_sep(@lines);
cut-here-UbsAqi0j6GoOuk5W5yWA
) "$@"
}
matrix() {
# Generate table X by Y
# Print a $1-row by $2-column table of random integers (0..123455)
# separated by $3, using all CPU threads via GNU Parallel.
row="$1"
col="$2"
sep="$3"
# mxn ROWS COLS SEP: print ROWS lines of COLS random ints joined by SEP
mxn() { perl -E 'for(1..'$1') { say join "'$3'", map {int(rand()*123456)} 1..'$2' } '; }
export -f mxn
thr=$(parallel --number-of-threads)
# Split the work into thr*100 jobs of prow rows each
prow=$((row/thr/100))
seq $((thr*100)) | parallel mxn $prow $col $sep
# Emit the rows lost to the integer division above
mxn $((row-prow*thr*100)) $col $sep
}
demo() {
	# Generate table X by Y
	# Print a $1-row by $2-column table of sequential integers
	# separated by $3.
	row="$1"
	col="$2"
	sep="$3"
	# One "-" per column tells paste to read that column from stdin
	paste_format_string=$(yes '-' | head -n "$col" | tr '\n' ' ')
	# Endless stream of dummy values (seq restarted forever) ...
	while seq 123456; do true; done |
		# ... folded into $col columns per line ...
		paste -d "$sep" $paste_format_string |
		# ... truncated to the first $row rows
		head -n "$row"
}
usage() {
	# Print usage on stderr and terminate with failure status.
	printf '%s\n' "Usage: $0 [-d delimiter] [-b blocksize]" >&2
	exit 1
}
version() {
	# Print version, copyright and license information on stdout.
	printf '%s\n' \
		'transpose 20201130' \
		'Copyright (C) 2020 Ole Tange, http://ole.tange.dk' \
		'License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>' \
		'This is free software: you are free to change and redistribute it.' \
		'transpose comes with no warranty.' \
		'Web site: https://gitlab.com/ole.tange/tangetools/-/tree/master/transpose'
}
main() {
# Parse options, detect the separator if none was given, then
# transpose either stdin or the file named on the command line.
block_size=100M
while getopts ":b:d:V" o; do
case "$o" in
(d)
# Convert \t to TAB using printf
d=$(printf "$OPTARG")
if [ "'" = "$d" ] ; then
# ' would break the shell quoting used in make_paste_files/super_paste
echo "Delimiter cannot be '"
usage
exit 0
fi
;;
(b)
block_size="$OPTARG"
;;
(V)
version
exit 0
;;
(*)
usage
;;
esac
done
shift $((OPTIND-1))
file="$1"
sep="$d"
# Put all tempfiles into a single dir
export TMPDIR=`mktemp -d`
# Holds the lines consumed by the separator autodetection, so they can
# be replayed in front of the remaining input
first_lines=$(tempfile)
if [ -z "$file" ]; then
# Input comes from stdin
if [ -z "$sep" ] ; then
sep=$(stdin_detect_sep $first_lines)
if [ -z "$sep" ] ; then
echo "transpose: Cannot autodetect separator. Use -d" >&2
exit 1
fi
fi
if [ "$sep" == '\s+' ] ; then
# Multiple spaces = separator
# (output is written with a single space)
osep=" "
else
osep="$sep"
fi
# Prepend stdin with the lines read in stdin_detect_sep
(cat "$first_lines"; rm "$first_lines"; cat) |
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
else
# Input comes from a file (seekable, so no replay needed)
if [ -z "$sep" ] ; then
sep=$(stdin_detect_sep < "$file" "$first_lines")
if [ -z "$sep" ] ; then
echo "transpose: Cannot autodetect separator. Use -d" >&2
exit 1
fi
fi
rm "$first_lines"
if [ "$sep" == '\s+' ] ; then
# Multiple spaces = separator
# (output is written with a single space)
osep=" "
else
osep="$sep"
fi
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
fi
rmdir "$TMPDIR" 2>/dev/null
}
# Make sure the whole file is read before starting
main "$@"