572 lines
13 KiB
Bash
Executable file
572 lines
13 KiB
Bash
Executable file
#!/bin/bash
|
|
|
|
: <<'_EOS'
|
|
=pod
|
|
=cut
|
|
|
|
=head1 NAME
|
|
|
|
transpose - transpose CSV file
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
... | B<transpose> [-d I<delim>] [-b I<blocksize>]
|
|
|
|
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input.csv>]
|
|
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
B<transpose> will read a CSV file and write the transposed version of
|
|
the file. I.e. rows will be columns, and columns will be rows.
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over 9
|
|
|
|
=item I<input.csv>
|
|
|
|
Input CSV file. If none is given B<transpose> reads from STDIN
|
|
(standard input).
|
|
|
|
|
|
=item B<-d> I<delim>
|
|
|
|
Use I<delim> as delimiter in input and output. If no delimiter is
|
|
given, B<transpose> will read the first 3 rows and try to guess the
|
|
delimiter.
|
|
|
|
The autodetection does not work well if values contain a quoted
|
|
delimiter: E.g. a,"value with quoted ,",other value
|
|
|
|
|
|
=item B<-b> I<blocksize>
|
|
|
|
Pass chunks of I<blocksize> bytes to the internal transposer. Memory
|
|
usage will be 10 times I<blocksize> per CPU core. Default is 100M.
|
|
|
|
|
|
=back
|
|
|
|
|
|
=head1 EXAMPLES
|
|
|
|
=head2 EXAMPLE: Transpose a TSV file
|
|
|
|
cat normal.tsv | transpose -d '\t' > transposed.tsv
|
|
|
|
=head2 EXAMPLE: transpose 57 GB 100000x100000 CSV file
|
|
|
|
# Generate 100000x100000 matrix
|
|
100kx100k() {
|
|
XbyY() {
|
|
while seq 123456 | shuf; do true; done |
|
|
paste $(perl -e 'print map {"- "} 1..'$1) |
|
|
head -n $2
|
|
}
|
|
export -f XbyY
|
|
seq 1000 |
|
|
parallel --nice 18 --delay 0.05 --files XbyY 100000 100 |
|
|
parallel -uj1 'cat {}; nice rm {} &'
|
|
}
|
|
100kx100k > 100kx100k
|
|
# Transpose it
|
|
transpose 100kx100k > 100kx100k.t
|
|
|
|
This takes around 1 GB/core and 18 minutes to run on 64C64T.
|
|
|
|
|
|
=head1 LIMITATIONS
|
|
|
|
B<transpose> is tested on a 1000000x1000000 3.6 TB table.
|
|
|
|
There is a limit on how many filehandles can be used in super_paste.
|
|
This is probably in the order of 1000000. This limits the number of
|
|
temporary files. By increasing the block size the number of temporary
|
|
files will be lowered. The 3.6 TB test resulted in 36000 files, so if
|
|
the limit is 1000000 files, it should work fine up to 100 TB before
|
|
you need to increase the block size.
|
|
|
|
|
|
=head1 DESIGN
|
|
|
|
B<transpose> is designed to deal efficiently with medium sized data
|
|
(up to 30 TB files) on systems with 2 GB RAM per CPU core. It works by
|
|
chopping the input into blocks (default: 100 MB). Each block is
|
|
transposed in parallel and saved to disk. Then these files are pasted
|
|
together and finally removed.
|
|
|
|
B<transpose> uses B<csvtool> if installed and a (slower) perl script
|
|
otherwise.
|
|
|
|
|
|
=head1 BUGS
|
|
|
|
B<transpose> makes files in $TMPDIR (default: /tmp). These are not
|
|
cleaned up, if B<transpose> is stopped abnormally (e.g. killed).
|
|
|
|
|
|
=head1 REPORTING BUGS
|
|
|
|
Report bugs: https://gitlab.com/ole.tange/tangetools/-/issues
|
|
|
|
|
|
=head1 AUTHOR
|
|
|
|
Copyright (C) 2013-2020 Ole Tange, http://ole.tange.dk and Free
|
|
Software Foundation, Inc.
|
|
|
|
|
|
=head1 LICENSE
|
|
|
|
Copyright (C) 2013 Free Software Foundation, Inc.
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 3 of the License, or
|
|
at your option any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
=head2 Documentation license I
|
|
|
|
Permission is granted to copy, distribute and/or modify this documentation
|
|
under the terms of the GNU Free Documentation License, Version 1.3 or
|
|
any later version published by the Free Software Foundation; with no
|
|
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
|
|
Texts. A copy of the license is included in the file fdl.txt.
|
|
|
|
=head2 Documentation license II
|
|
|
|
You are free:
|
|
|
|
=over 9
|
|
|
|
=item B<to Share>
|
|
|
|
to copy, distribute and transmit the work
|
|
|
|
=item B<to Remix>
|
|
|
|
to adapt the work
|
|
|
|
=back
|
|
|
|
Under the following conditions:
|
|
|
|
=over 9
|
|
|
|
=item B<Attribution>
|
|
|
|
You must attribute the work in the manner specified by the author or
|
|
licensor (but not in any way that suggests that they endorse you or
|
|
your use of the work).
|
|
|
|
=item B<Share Alike>
|
|
|
|
If you alter, transform, or build upon this work, you may distribute
|
|
the resulting work only under the same, similar or a compatible
|
|
license.
|
|
|
|
=back
|
|
|
|
With the understanding that:
|
|
|
|
=over 9
|
|
|
|
=item B<Waiver>
|
|
|
|
Any of the above conditions can be waived if you get permission from
|
|
the copyright holder.
|
|
|
|
=item B<Public Domain>
|
|
|
|
Where the work or any of its elements is in the public domain under
|
|
applicable law, that status is in no way affected by the license.
|
|
|
|
=item B<Other Rights>
|
|
|
|
In no way are any of the following rights affected by the license:
|
|
|
|
=over 2
|
|
|
|
=item *
|
|
|
|
Your fair dealing or fair use rights, or other applicable
|
|
copyright exceptions and limitations;
|
|
|
|
=item *
|
|
|
|
The author's moral rights;
|
|
|
|
=item *
|
|
|
|
Rights other persons may have either in the work itself or in
|
|
how the work is used, such as publicity or privacy rights.
|
|
|
|
=back
|
|
|
|
=back
|
|
|
|
=over 9
|
|
|
|
=item B<Notice>
|
|
|
|
For any reuse or distribution, you must make clear to others the
|
|
license terms of this work.
|
|
|
|
=back
|
|
|
|
A copy of the full license is included in the file as cc-by-sa.txt.
|
|
|
|
=head1 DEPENDENCIES
|
|
|
|
B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
|
|
|
|
|
|
=head1 SEE ALSO
|
|
|
|
B<csvtool>(1), B<bash>(1), B<parallel>(1), B<paste>(1)
|
|
|
|
=cut
|
|
_EOS
|
|
#'
|
|
|
|
# Timings: 100kx200k (114GB) 34min
|
|
# 200kx200k (228GB) 63min
|
|
|
|
transpose_perl() {
    # Simple in-memory transpose using Perl's Text::CSV
    # Arguments:
    #   $1 = separator; the special value '\s+' means "runs of
    #        whitespace" and is normalized to a single space
    #   remaining arguments are passed on to the perl script
    # Standard input:
    #   data to be transposed
    # Standard output:
    #   transposed data
    local sep="$1"
    local merge_spaces=""
    shift

    # This must be decided before the pipeline below: every pipeline
    # element runs in a subshell, so assignments made there are lost.
    if [ "$sep" == '\s+' ] ; then
        # Multiple spaces = separator
        merge_spaces=1
        sep=" "
    fi

    if [ -n "$merge_spaces" ] ; then
        # Turn TABs into spaces, then squeeze runs of spaces into one
        tr '\t' ' ' | tr -s ' '
    else
        cat
    fi | perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
#!/usr/bin/perl

use Text::CSV;
use Getopt::Long;

Getopt::Long::Configure("bundling","require_order");
my $retval = GetOptions("debug|D=s" => \$opt::debug,
                        "delimiter|d=s" => \$opt::delimiter,
                        "verbose|v" => \@opt::verbose,
                        "simple|s" => \$opt::simple,
    );

if(defined $opt::delimiter) {
    simple();
} else {
    die("-d must be set");
}

sub simple {
    # Read all of the input into @table with rows and columns swapped,
    # then print @table joined by the delimiter
    my (@table);
    my $col = 0;
    my $csv_setting = { binary => 1, sep_char => $opt::delimiter };
    my $sep = $csv_setting->{sep_char};
    my $csv = Text::CSV->new($csv_setting)
        or die "Cannot use CSV: ".Text::CSV->error_diag ();

    while(my $l = <>) {
        if(not $csv->parse($l)) {
            die "CSV has unexpected format";
        }
        # append to each row
        my $row = 0;
        for($csv->fields()) {
            $table[$row][$col] = defined($_) ? $_ : '';
            $row++;
        }
        $col++;
    }
    print map { join($sep,@$_),"\n" } @table;
}
cut-here-UbsAqi0j6GoOuk5W5yWA
    ) -d "$sep" "$@"
}
export -f transpose_perl
|
|
|
|
transpose_csvtool() {
    # Use csvtool to transpose
    # Arguments:
    #   $1 = separator; the special value '\s+' means "runs of
    #        whitespace" and is normalized to a single space
    # Standard input:
    #   data to be transposed
    # Standard output:
    #   transposed data
    local sep="$1"
    if [ "$sep" == '\s+' ] ; then
        # Multiple spaces = separator:
        # turn TABs into spaces, then squeeze runs of spaces into one
        tr '\t' ' ' | tr -s ' ' |
            csvtool transpose -t " " -u " " -
    else
        csvtool transpose -t "$sep" -u "$sep" -
    fi
}
export -f transpose_csvtool
|
|
|
|
detect_transposer() {
    # Find the fastest transpose tool installed
    # Each candidate is probed in a fresh bash, so it must be an
    # exported function (export -f).
    # Standard output:
    #   name of the transposer function to use
    # Returns:
    #   0 if a working transposer was found, 1 otherwise
    if bash -c 'echo 1 | transpose_csvtool ,' >/dev/null 2>&1; then
        echo transpose_csvtool
    elif bash -c 'echo 1 | transpose_perl ,' >/dev/null 2>&1; then
        echo transpose_perl
    else
        echo Error: You need the Perl Text::CSV module or csvtool. >&2
        # Tell the caller that nothing usable was found
        return 1
    fi
}
|
|
|
|
make_paste_files() {
    # Transpose input in blocks
    # Arguments:
    #   $1 = block size passed to GNU Parallel's --block
    #   $2 = separator
    #   $3 = input file (optional; standard input is used if empty)
    # Output:
    #   each transposed block as file name
    # Returns:
    #   1 if no transposer is available, else the status of parallel
    local block_size="$1"
    local sep="$2"
    local file="$3"
    local transposer
    local -a par_opt

    # Stop here instead of running parallel with an empty command
    transposer=$(detect_transposer) || return 1
    [ -n "$transposer" ] || return 1

    par_opt=(-k --files --block "$block_size")
    # The separator is wrapped in '...' because GNU Parallel hands the
    # command to a shell, which would otherwise interpret it.
    if [ -z "$file" ]; then
        parallel "${par_opt[@]}" --pipe "$transposer" "'$sep'"
    else
        parallel "${par_opt[@]}" --pipe-part -a "$file" "$transposer" "'$sep'"
    fi
}
|
|
|
|
super_paste() {
    # Like 'paste' up to 1000000 files
    # More than 250000 files requires extra filehandles for GNU Parallel
    # Arguments:
    #   $1 = delimiter to put between the pasted columns
    # Standard input:
    #   names of the files to paste, one per line
    # Standard output:
    #   the pasted files
    # The input files and all temporary files are removed afterwards.
    local sep="$1"
    local paste_files
    local fifo

    cleanup() {
        printf "\rSIGINT caught \n" >&2
        # Refuse to run 'rm -rf' if TMPDIR is unset or empty
        (rm -rf "${TMPDIR:?}" &)&
        exit 1
    }

    trap 'cleanup' SIGINT

    # mktemp replaces the deprecated 'tempfile'.
    # File that holds the groups of file names
    paste_files=$(mktemp) || return 1
    # Only the name is needed: it is the basename of the fifos
    fifo=$(mktemp) || return 1
    rm "$fifo"
    # Group files from stdin in groups of 1000 files
    parallel -k -n1000 echo > "$paste_files"

    # Define replacement string {0#} to 0-pad job number
    # (so that the glob "$fifo"* below lists the fifos in job order)
    export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
                $_=sprintf("%0${f}d",seq())'\'

    # Make fifos that can be read from
    cat "$paste_files" | parallel "rm -f '$fifo'{0#}; mkfifo '$fifo'{0#}"

    # Start a paste process for every 1000 files
    # {} is a whole group of file names, hence the eval to split it again
    cat "$paste_files" | parallel -j0 "eval paste -d \''$sep'\' {} > '$fifo'{0#}" &

    # Paste all the fifos
    paste -d "$sep" "$fifo"*

    # Cleanup
    cat "$paste_files" | parallel "eval rm -f {} '$fifo'{0#}"
    rm "$paste_files"
}
|
|
|
|
stdin_detect_sep() {
|
|
# Read the first 3 lines of stdin and detect the separator
|
|
# Only , space tab ; | : \0 and whitespace are detected
|
|
# Save the 3 lines input to file so it can be read again later
|
|
perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
|
|
#!/usr/bin/perl
|
|
|
|
sub max(@) {
|
|
# Returns:
|
|
# Maximum value of array
|
|
my $max;
|
|
for (@_) {
|
|
# Skip undefs
|
|
defined $_ or next;
|
|
defined $max or do { $max = $_; next; }; # Set $_ to the first non-undef
|
|
$max = ($max > $_) ? $max : $_;
|
|
}
|
|
return $max;
|
|
}
|
|
|
|
sub find_sep(@) {
|
|
# Try common find the separators.
|
|
# Do we get the same for each line?
|
|
my @csv = grep { not /^#/ } @_;
|
|
my @sep = (",", "\t", ";", ' ', '\|', ':', "\0", '\s+');
|
|
my $columns;
|
|
my %col;
|
|
for my $sep (@sep) {
|
|
for my $line (@csv) {
|
|
$columns = split /$sep/, $line;
|
|
if($columns > 1) {
|
|
$col{$sep."\0".$columns}++
|
|
}
|
|
}
|
|
}
|
|
# Find max $col{$sep,$columns}
|
|
my $most_lines = max(values %col);
|
|
|
|
my %sepcol = (map { split /\0/, $_ }
|
|
grep { $col{$_} == $most_lines } keys %col);
|
|
my $most_cols = max(values %sepcol);
|
|
return ((grep { $sepcol{$_} == $most_cols } keys %sepcol)[0]);
|
|
}
|
|
|
|
my $buf = "";
|
|
my $newlines = 0;
|
|
open(OUT, "+>", shift) || die;
|
|
# Copy (at least) 3 lines to OUT
|
|
while(sysread(STDIN,$buf,131072)) {
|
|
print OUT $buf;
|
|
$newlines += $buf =~ tr/\n/\n/;
|
|
if($newlines >= 3) {
|
|
last;
|
|
}
|
|
}
|
|
seek(OUT,0,0) || die;
|
|
my @lines = <OUT>;
|
|
close OUT;
|
|
# Remove last half-line
|
|
pop @lines;
|
|
print find_sep(@lines);
|
|
|
|
cut-here-UbsAqi0j6GoOuk5W5yWA
|
|
) "$@"
|
|
}
|
|
|
|
matrix() {
|
|
# Generate table X by Y
|
|
row="$1"
|
|
col="$2"
|
|
sep="$3"
|
|
|
|
mxn() { perl -E 'for(1..'$1') { say join "'$3'", map {int(rand()*123456)} 1..'$2' } '; }
|
|
export -f mxn
|
|
thr=$(parallel --number-of-threads)
|
|
prow=$((row/thr/100))
|
|
seq $((thr*100)) | parallel mxn $prow $col $sep
|
|
mxn $((row-prow*thr*100)) $col $sep
|
|
}
|
|
|
|
demo() {
    # Generate table X by Y filled with the numbers 1..123456 (repeated)
    # Arguments:
    #   $1 = number of rows
    #   $2 = number of columns
    #   $3 = separator
    # Standard output:
    #   the generated table
    local row="$1"
    local col="$2"
    local sep="$3"
    local -a paste_format=()
    local i

    # One "-" (= read from stdin) for each col
    for ((i = 0; i < col; i++)); do
        paste_format+=(-)
    done
    # Generate dummy values; the loop ends when seq is killed by
    # SIGPIPE after head has exited
    while seq 123456; do true; done |
        # Use paste's format string to make $col columns
        paste -d "$sep" "${paste_format[@]}" |
        # Keep the top $row rows
        head -n "$row"
}
|
|
|
|
usage() {
    # Print a short usage message on standard error and exit with
    # status 1. Lists all options handled by main and the optional
    # input file.
    echo "Usage: $0 [-d delimiter] [-b blocksize] [-V] [input.csv]" 1>&2
    exit 1
}
|
|
|
|
version() {
    # Print version, copyright and license information on standard
    # output, one argument per line
    printf '%s\n' \
        "transpose 20201130" \
        "Copyright (C) 2020 Ole Tange, http://ole.tange.dk" \
        "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>" \
        "This is free software: you are free to change and redistribute it." \
        "transpose comes with no warranty." \
        "" \
        "Web site: https://gitlab.com/ole.tange/tangetools/-/tree/master/transpose"
}
|
|
|
|
main() {
    # Parse the command line, find the separator (from -d or by
    # autodetection) and run the transpose pipeline:
    #   make_paste_files | super_paste
    # Arguments:
    #   [-d delim] [-b blocksize] [-V] [input.csv]
    # Standard input:
    #   data to be transposed (if no input file is given)
    # Standard output:
    #   transposed data
    # Returns:
    #   exit status of the transpose pipeline
    local block_size=100M
    local d=""
    local o
    local file
    local sep
    local osep
    local first_lines
    local rc

    while getopts ":b:d:V" o; do
        case "$o" in
            (d)
                # Convert \t to TAB using printf.
                # %b expands the backslash escapes without using the
                # user's value as format string (so '%' is a valid
                # delimiter).
                d=$(printf '%b' "$OPTARG")
                if [ "'" = "$d" ] ; then
                    echo "Delimiter cannot be '" >&2
                    usage
                    exit 1
                fi
                ;;
            (b)
                block_size="$OPTARG"
                ;;
            (V)
                version
                exit 0
                ;;
            (*)
                usage
                ;;
        esac
    done
    shift $((OPTIND-1))
    file="$1"
    sep="$d"

    # Put all tempfiles into a single dir
    if ! TMPDIR=$(mktemp -d); then
        echo "transpose: Cannot create temporary directory" >&2
        exit 1
    fi
    export TMPDIR
    # Holds the start of the input consumed by the autodetection
    if ! first_lines=$(mktemp); then
        echo "transpose: Cannot create temporary file" >&2
        rmdir "$TMPDIR" 2>/dev/null
        exit 1
    fi

    if [ -z "$sep" ] ; then
        if [ -z "$file" ]; then
            sep=$(stdin_detect_sep "$first_lines")
        else
            sep=$(stdin_detect_sep "$first_lines" < "$file")
        fi
        if [ -z "$sep" ] ; then
            echo "transpose: Cannot autodetect separator. Use -d" >&2
            # Do not leave the temporary files behind
            rm -f "$first_lines"
            rmdir "$TMPDIR" 2>/dev/null
            exit 1
        fi
    fi

    if [ "$sep" == '\s+' ] ; then
        # Multiple spaces = separator
        osep=" "
    else
        osep="$sep"
    fi

    if [ -z "$file" ]; then
        # Prepend stdin with the lines read in stdin_detect_sep
        (cat "$first_lines"; rm "$first_lines"; cat) |
            make_paste_files "$block_size" "$sep" "$file" | super_paste "$osep"
        rc=$?
    else
        # The file is read again from the start: the saved lines are
        # not needed
        rm "$first_lines"
        make_paste_files "$block_size" "$sep" "$file" | super_paste "$osep"
        rc=$?
    fi
    # Only removes the dir if it is empty
    rmdir "$TMPDIR" 2>/dev/null
    return "$rc"
}
|
|
|
|
# Make sure the whole file is read before starting
|
|
main "$@"
|