tangetools/transpose/transpose

#!/bin/bash

: <<'_EOS'
=pod
=cut

=head1 NAME

transpose - transpose CSV file

=head1 SYNOPSIS

... | B<transpose> [-d I<delim>] [-b I<blocksize>]

B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input.csv>]


=head1 DESCRIPTION

B<transpose> will read a CSV file and write the transposed version of
the file. I.e. rows will be columns, and columns will be rows.

=head1 OPTIONS

=over 9

=item I<input.csv>

Input CSV file. If none is given B<transpose> reads from STDIN
(standard input).


=item B<-d> I<delim>

Use I<delim> as delimiter in input and output. If no delimiter is
given, B<transpose> will read the first 3 rows and try to guess the
delimiter.

The autodetection does not work well if values contain a quoted
delimiter: E.g. a,"value with quoted ,",other value


=item B<-b> I<blocksize>

Pass chunks of I<blocksize> bytes to the internal transposer. Memory
usage will be 10 times I<blocksize> per CPU core. Default is 100M.


=back


=head1 EXAMPLES

=head2 EXAMPLE: Transpose a TSV file

    cat normal.tsv | transpose -d '\t' > transposed.tsv

=head2 EXAMPLE: transpose 57 GB 100000x100000 CSV file

    # Generate 100000x100000 matrix
    100kx100k() {
        XbyY() {
            while seq 123456 | shuf; do true; done |
                paste $(perl -e 'print map {"- "} 1..'$1) |
                head -n $2
        }
        export -f XbyY
        seq 1000 |
          parallel --nice 18 --delay 0.05 --files XbyY 100000 100 |
          parallel -uj1 'cat {}; nice rm {} &'
    }
    100kx100k > 100kx100k
    # Transpose it
    transpose 100kx100k > 100kx100k.t

This takes around 1 GB/core and 18 minutes to run on 64C64T.


=head1 LIMITATIONS

B<transpose> is tested on a 1000000x1000000 3.6 TB table.

There is a limit on how many filehandles can be used in super_paste.
This is probably in the order of 1000000. This limits the number of
temporary files. By increasing the block size the number of temporary
files will be lowered. The 3.6 TB test resulted in 36000 files, so if
the limit is 1000000 files, it should work fine up to 100 TB before
you need to increase the block size.


=head1 DESIGN

B<transpose> is designed to deal efficiently with medium sized data
(up to 30 TB files) on systems with 2 GB RAM per CPU core. It works by
chopping the input into blocks (default: 100 MB). Each block is
transposed in parallel and saved to disk. Then these files are pasted
together and finally removed.

B<transpose> uses B<csvtool> if installed and a (slower) perl script
otherwise.


=head1 BUGS

B<transpose> makes files in $TMPDIR (default: /tmp). These are not
cleaned up, if B<transpose> is stopped abnormally (e.g. killed).


=head1 REPORTING BUGS

Report bugs: https://gitlab.com/ole.tange/tangetools/-/issues


=head1 AUTHOR

Copyright (C) 2013-2020 Ole Tange, http://ole.tange.dk and Free
Software Foundation, Inc.


=head1 LICENSE

Copyright (C) 2013 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
at your option any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

=head2 Documentation license I

Permission is granted to copy, distribute and/or modify this documentation
under the terms of the GNU Free Documentation License, Version 1.3 or
any later version published by the Free Software Foundation; with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts.  A copy of the license is included in the file fdl.txt.

=head2 Documentation license II

You are free:

=over 9

=item B<to Share>

to copy, distribute and transmit the work

=item B<to Remix>

to adapt the work

=back

Under the following conditions:

=over 9

=item B<Attribution>

You must attribute the work in the manner specified by the author or
licensor (but not in any way that suggests that they endorse you or
your use of the work).

=item B<Share Alike>

If you alter, transform, or build upon this work, you may distribute
the resulting work only under the same, similar or a compatible
license.

=back

With the understanding that:

=over 9

=item B<Waiver>

Any of the above conditions can be waived if you get permission from
the copyright holder.

=item B<Public Domain>

Where the work or any of its elements is in the public domain under
applicable law, that status is in no way affected by the license.

=item B<Other Rights>

In no way are any of the following rights affected by the license:

=over 2

=item *

Your fair dealing or fair use rights, or other applicable
copyright exceptions and limitations;

=item *

The author's moral rights;

=item *

Rights other persons may have either in the work itself or in
how the work is used, such as publicity or privacy rights.

=back

=back

=over 9

=item B<Notice>

For any reuse or distribution, you must make clear to others the
license terms of this work.

=back

A copy of the full license is included in the file as cc-by-sa.txt.

=head1 DEPENDENCIES

B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.


=head1 SEE ALSO

B<csvtool>(1), B<bash>(1), B<parallel>(1), B<paste>(1)

=cut
_EOS
#'

# Timings: 100kx200k (114GB) 34min
# 200kx200k (228GB) 63min

transpose_perl() {
    # Simple in-memory transpose using Perl's Text::CSV
    # Arguments:
    #   $1 = separator. '\s+' means: runs of spaces/tabs are one
    #        separator (output is then separated by a single space)
    #   remaining arguments are passed on to the perl script
    # Standard input:
    #   data to be transposed
    # Standard output:
    #   transposed data. Every output row has one field per input
    #   line: short input rows are padded with empty fields
    local sep="$1"
    shift
    local -a space_merger

    if [ "$sep" == '\s+' ] ; then
	# Multiple spaces = separator:
	# turn TABs into spaces and squeeze runs of spaces into one
	space_merger=(tr -s '\t ' '  ')
	sep=" "
    else
	space_merger=(cat)
    fi
    # The script is given to perl as a file, so stdin is free for the data
    "${space_merger[@]}" | perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
#!/usr/bin/perl

use Text::CSV;
use Getopt::Long;

Getopt::Long::Configure("bundling","require_order");
my $retval = GetOptions("debug|D=s" => \$opt::debug,
			"delimiter|d=s" => \$opt::delimiter,
			"verbose|v" => \@opt::verbose,
			"simple|s" => \$opt::simple,
    );

if(defined $opt::delimiter) {
    simple();
} else {
    die("-d must be set");
}

sub simple {
    # Read the whole table into memory and print it transposed
    my (@table);
    # Input line number = column number in the output
    my $col = 0;
    my $csv_setting = { binary => 1, sep_char => $opt::delimiter };
    my $sep = $csv_setting->{sep_char};
    my $csv = Text::CSV->new($csv_setting)
	or die "Cannot use CSV: ".Text::CSV->error_diag ();

    while(my $l = <>) {
	if(not $csv->parse($l)) {
	    die "CSV has unexpected format";
	}
	# append to each row
	my $row = 0;
	for($csv->fields()) {
	    $table[$row][$col] = defined($_) ? $_ : '';
	    $row++;
	}
	$col++;
    }
    for my $r (@table) {
	# Pad the row to one field per input line. If an input line
	# was short, the row would otherwise miss fields, and the
	# columns would shift when this output is pasted next to the
	# output from other blocks
	$#{$r} = $col - 1;
	print join($sep, map { defined($_) ? $_ : '' } @$r), "\n";
    }
}
cut-here-UbsAqi0j6GoOuk5W5yWA
	  ) -d "$sep" "$@"
}
export -f transpose_perl

transpose_csvtool() {
    # Transpose with csvtool
    # Arguments:
    #   $1 = separator used for both input and output. '\s+' means:
    #        runs of spaces/tabs are one separator (output is then
    #        separated by a single space)
    # Standard input:
    #   data to be transposed
    # Standard output:
    #   transposed data
    sep="$1"
    case "$sep" in
	('\s+')
	    # Multiple spaces = separator:
	    # TABs become spaces, runs of spaces become a single space
	    tr '\t ' ' ' |
		tr -s ' ' |
		csvtool transpose -t " " -u " " -
	    ;;
	(*)
	    csvtool transpose -t "$sep" -u "$sep" -
	    ;;
    esac
}
export -f transpose_csvtool

detect_transposer() {
    # Find the fastest transpose tool installed
    # Standard output:
    #   name of the transposer function to use: transpose_csvtool is
    #   preferred over transpose_perl (nothing if neither works)
    # Returns:
    #   0 if a working transposer was found
    #   1 if not (an error is printed on standard error)
    local transposer
    for transposer in transpose_csvtool transpose_perl; do
	# Try the exported function on a 1x1 table in a fresh bash
	if bash -c "echo 1 | $transposer ," >/dev/null 2>&1; then
	    echo "$transposer"
	    return 0
	fi
    done
    echo Error: You need the Perl Text::CSV module or csvtool. >&2
    # Let the caller see that nothing usable was found
    return 1
}

make_paste_files() {
    # Transpose input in blocks
    # Arguments:
    #   $1 = block size (given to parallel --block)
    #   $2 = separator (must not contain ')
    #   $3 = input file. If empty: read standard input
    # Standard output:
    #   each transposed block as file name
    # Returns:
    #   1 if no transposer is available (nothing is run)
    #   otherwise the exit value from parallel
    local block_size="$1"
    local sep="$2"
    local file="$3"
    local transposer
    local -a par_opt

    # Stop here if detection failed: without a transposer parallel
    # would be started with an empty command
    transposer=$(detect_transposer) || return 1
    if [ -z "$transposer" ]; then
	return 1
    fi
    par_opt=(-k --files --block "$block_size")
    # '$sep' is quoted for the shell that parallel starts
    if [ -z "$file" ]; then
	parallel "${par_opt[@]}" --pipe "$transposer" "'$sep'"
    else
	parallel "${par_opt[@]}" --pipe-part -a "$file" "$transposer" "'$sep'"
    fi
}

super_paste() {
    # Like 'paste' up to 1000000 files
    # More than 250000 files requires extra filehandles for GNU Parallel
    # Arguments:
    #   $1 = separator to paste with (must not contain ')
    # Standard input:
    #   names of the files to paste, one per line
    # Standard output:
    #   the files pasted together
    # Side effects:
    #   the files read from stdin are removed when done
    #   on SIGINT all of $TMPDIR is removed
    local sep="$1"
    local paste_files fifo

    cleanup() {
	printf "\rSIGINT caught      \n" >&2
	(rm -rf "$TMPDIR" &)&
	exit 1
    }

    trap 'cleanup' SIGINT

    # mktemp (not tempfile): tempfile is a Debian-only tool that is
    # deprecated and missing on many systems
    paste_files=$(mktemp) || return 1
    # basename for the fifos: only the name is needed, not the file
    fifo=$(mktemp) || return 1
    rm "$fifo"
    # Group files from stdin in groups of 1000 files
    parallel -k -n1000 echo > "$paste_files"

    # Define replacement string {0#} to 0-pad job number
    export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
                    $_=sprintf("%0${f}d",seq())'\'

    # Make fifos that can be read from
    cat "$paste_files" | parallel "rm -f '$fifo'{0#}; mkfifo '$fifo'{0#}"

    # Start a paste process for every 1000 files
    cat "$paste_files" | parallel -j0 "eval paste -d \''$sep'\' {} > '$fifo'{0#}" &

    # Paste all the fifos
    # (the 0-padded job number makes the glob sort in job order)
    eval paste -d "'$sep'" "$fifo"*

    # Cleanup
    cat "$paste_files" | parallel "eval rm -f {} '$fifo'{0#}"
    rm "$paste_files"
}

stdin_detect_sep() {
    # Read the first 3 lines of stdin and detect the separator
    # Only , TAB ; space | : \0 and whitespace are detected
    # Arguments:
    #   $1 = file to save the data read from stdin in, so it can be
    #        read again later
    # Standard input:
    #   the table. (At least) the first 3 lines are read
    # Standard output:
    #   the separator. '\s+' if it is runs of whitespace.
    #   Nothing if no separator was found
    perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
#!/usr/bin/perl

sub find_sep(@) {
    # Try the common separators:
    # Which one splits most lines into the same number of columns?
    # If more than one does: the one giving most columns.
    # If still more than one: the one listed first in @sep. So the
    # result does not depend on the (random) order of a hash.
    # Returns:
    #   the separator (undef if no separator gives > 1 column)
    my @csv = grep { not /^#/ } @_;
    # [ regexp to split on, separator to return ]
    # They differ for |, which must be quoted in a regexp
    my @sep = ([",", ","], ["\t", "\t"], [";", ";"], [' ', ' '],
	       ['\|', "|"], [':', ':'], ["\0", "\0"], ['\s+', '\s+']);
    my ($best_sep, $best_lines, $best_cols);
    for my $candidate (@sep) {
	my ($regexp, $sep) = @$candidate;
	# $lines{$columns} = number of lines that have $columns columns
	my %lines;
	for my $line (@csv) {
	    my $columns = split /$regexp/, $line;
	    if($columns > 1) {
		$lines{$columns}++;
	    }
	}
	for my $columns (keys %lines) {
	    # Strictly better only: on a tie the earlier separator stays
	    if(not defined $best_lines
	       or $lines{$columns} > $best_lines
	       or ($lines{$columns} == $best_lines
		   and $columns > $best_cols)) {
		($best_sep, $best_lines, $best_cols) =
		    ($sep, $lines{$columns}, $columns);
	    }
	}
    }
    return $best_sep;
}

my $buf = "";
my $newlines = 0;
open(OUT, "+>", shift) || die;
# Copy (at least) 3 lines to OUT
while(sysread(STDIN,$buf,131072)) {
    print OUT $buf;
    $newlines += $buf =~ tr/\n/\n/;
    if($newlines >= 3) {
	last;
    }
}
seek(OUT,0,0) || die;
my @lines = <OUT>;
close OUT;
# Remove last half-line. A line ending in newline is not a half-line,
# and the only line is kept, so one-line input can be detected, too
if(@lines > 1 and $lines[-1] !~ /\n\z/) {
    pop @lines;
}
my $sep = find_sep(@lines);
if(defined $sep) {
    print $sep;
}

cut-here-UbsAqi0j6GoOuk5W5yWA
	  ) "$@"
}

matrix() {
    # Generate table X by Y of random integers (0..123455)
    # Arguments:
    #   $1 = number of rows
    #   $2 = number of columns
    #   $3 = separator. It is inserted into a double quoted perl
    #        string, so \t gives TAB. Must not contain ' or "
    # Standard output:
    #   the table
    local row="$1"
    local col="$2"
    local sep="$3"
    local thr prow

    # mxn rows cols sep: print rows x cols random integers
    mxn() { perl -E 'for(1..'"$1"') { say join "'"$3"'", map {int(rand()*123456)} 1..'"$2"' } '; }
    export -f mxn
    thr=$(parallel --number-of-threads)
    # 100 jobs per thread, each making $prow rows
    prow=$((row/thr/100))
    # '$sep' is quoted for the shell that parallel starts: otherwise
    # separators like space ; | and \t are lost or break the command
    seq $((thr*100)) | parallel mxn "$prow" "$col" "'$sep'"
    # The rows left over by the integer division
    mxn $((row-prow*thr*100)) "$col" "$sep"
}

demo() {
    # Generate table X by Y of the integers 1..123456 (repeated)
    # Arguments:
    #   $1 = number of rows
    #   $2 = number of columns
    #   $3 = separator
    # Standard output:
    #   the table: the integers are filled in row by row
    row="$1"
    col="$2"
    sep="$3"

    # One "- " per column: for every "-" paste takes a value from stdin
    paste_format_string=$(perl -e 'print "- " for 1..'$col)
    # Endless supply of dummy values ...
    while seq 123456; do
	:
    done |
	# ... put into $col columns by paste ...
	paste -d "$sep" $paste_format_string |
	# ... of which only the top $row rows are kept
	head -n "$row"
}

usage() {
    # Print how to call the program on standard error.
    # Does not return: exits with value 1
    printf '%s\n' "Usage: $0 [-d delimiter] [-b blocksize]" >&2
    exit 1
}

version() {
    # Print version, copyright, license and web site on standard output
    printf '%s\n' \
	   'transpose 20201130' \
	   'Copyright (C) 2020 Ole Tange, http://ole.tange.dk' \
	   'License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>' \
	   'This is free software: you are free to change and redistribute it.' \
	   'transpose comes with no warranty.' \
	   '' \
	   'Web site: https://gitlab.com/ole.tange/tangetools/-/tree/master/transpose'
}

main() {
    block_size=100M
    while getopts ":b:d:V" o; do
	case "$o" in
	    (d)
		# Convert \t to TAB using printf
		d=$(printf "$OPTARG")
		if [ "'" = "$d" ] ; then
		    echo "Delimiter cannot be '"
		    usage
		    exit 0
		fi
		;;
	    (b)
		block_size="$OPTARG"
		;;
	    (V)
		version
		exit 0
		;;
	    (*)
		usage
		;;
	esac
    done
    shift $((OPTIND-1))
    file="$1"
    sep="$d"
    # Put all tempfiles into a single dir
    export TMPDIR=`mktemp -d`
    first_lines=$(tempfile)
    if [ -z "$file" ]; then
	if [ -z "$sep" ] ; then
	    sep=$(stdin_detect_sep $first_lines)
	    if [ -z "$sep" ] ; then
		echo "transpose: Cannot autodetect separator. Use -d" >&2
		exit 1
	    fi
	fi
	if [ "$sep" == '\s+' ] ; then
	    # Multiple spaces = separator
	    osep=" "
	else
	    osep="$sep"
	fi
	# Prepend stdin with the lines read in stdin_detect_sep
	(cat "$first_lines"; rm "$first_lines"; cat) |
	    make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
    else
	if [ -z "$sep" ] ; then
	    sep=$(stdin_detect_sep < "$file" "$first_lines")
	    if [ -z "$sep" ] ; then
		echo "transpose: Cannot autodetect separator. Use -d" >&2
		exit 1
	    fi
	fi
	rm "$first_lines"
	if [ "$sep" == '\s+' ] ; then
	    # Multiple spaces = separator
	    osep=" "
	else
	    osep="$sep"
	fi
	make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
    fi
    rmdir "$TMPDIR" 2>/dev/null
}

# All work is done in main(), which is not called until this last
# line: this makes sure the whole file is read before starting
main "$@"