transpose: Autofind separator.
This commit is contained in:
parent
39f19757ae
commit
66be041e30
9
Makefile
9
Makefile
|
@ -1,6 +1,6 @@
|
|||
CMD = blink 2grep 2search burncpu drac duplicate-packets em emoticons \
|
||||
encdir field find-first-fail forever fxkill G gitnext gitundo \
|
||||
goodpasswd histogram Loffice mtrr mirrorpdf neno off parsort \
|
||||
goodpasswd histogram Loffice mtrr mirrorpdf neno not off \
|
||||
pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \
|
||||
rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
|
||||
swapout T teetime timestamp tracefile transpose upsidedown \
|
||||
|
@ -12,10 +12,9 @@ all: blink/blink.1 2search/2grep.1 2search/2search.1 \
|
|||
find-first-fail/find-first-fail.1 G/G.1 gitnext/gitnext.1 \
|
||||
gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
|
||||
histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
|
||||
off/off.1 parsort/parsort.1 pdfman/pdfman.1 pidcmd/pidcmd.1 \
|
||||
pidtree/pidtree.1 plotpipe/plotpipe.1 puniq/puniq.1 \
|
||||
rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \
|
||||
seekmaniac/seekmaniac.1 shython/shython.1 \
|
||||
off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1 \
|
||||
plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1 \
|
||||
rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1 \
|
||||
sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
|
||||
stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \
|
||||
tracefile/tracefile.1 transpose/transpose.1 T/T.1 \
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
#!/bin/bash
|
||||
|
||||
: <<=cut
|
||||
: <<'_EOS'
|
||||
=pod
|
||||
=cut
|
||||
|
||||
=head1 NAME
|
||||
|
||||
|
@ -9,30 +10,40 @@ transpose - transpose CSV file
|
|||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input>]
|
||||
... | B<transpose> [-d I<delim>] [-b I<blocksize>]
|
||||
|
||||
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input.csv>]
|
||||
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
B<transpose> will read a CSV fie
|
||||
B<transpose> will read a CSV file and write the transposed version of
|
||||
the file. I.e. rows will be columns, and columns will be rows.
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
=over 9
|
||||
|
||||
=item I<input>
|
||||
=item I<input.csv>
|
||||
|
||||
Input CSV file. If none is given reads from STDIN (standard input).
|
||||
Input CSV file. If none is given B<transpose> reads from STDIN
|
||||
(standard input).
|
||||
|
||||
|
||||
=item B<-d> I<delim>
|
||||
|
||||
Use I<delim> as delimiter in input and output.
|
||||
Use I<delim> as delimiter in input and output. If no delimiter is
|
||||
given, B<transpose> will read the first 3 rows and try to guess the
|
||||
delimiter.
|
||||
|
||||
The autodetection does not work well if values contain a quoted
|
||||
delimiter: E.g. a,"value with quoted ,",other value
|
||||
|
||||
|
||||
=item B<-b> I<blocksize>
|
||||
|
||||
Pass chunks of I<blocksize> bytes to the internal transposer. Memory
|
||||
usage will be 10 times I<blocksiz> per CPU core. Default is 10M.
|
||||
usage will be 10 times I<blocksize> per CPU core. Default is 100M.
|
||||
|
||||
|
||||
=back
|
||||
|
@ -40,18 +51,60 @@ usage will be 10 times I<blocksiz> per CPU core. Default is 10M.
|
|||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 EXAMPLE: Transpose a medium sized TSV file
|
||||
=head2 EXAMPLE: Transpose a TSV file
|
||||
|
||||
cat normal.tsv | transpose -d '\t' > transposed.tsv
|
||||
|
||||
=head2 EXAMPLE: transpose 57 GB 100000x100000 CSV file
|
||||
|
||||
# Generate 100000x100000 matrix
|
||||
100kx100k() {
|
||||
100000x() {
|
||||
while seq 123456 | shuf; do true; done |
|
||||
paste $(perl -e 'print map {"- "} 1..100000') |
|
||||
head -n $1
|
||||
}
|
||||
export -f 100000x
|
||||
seq 1000 | parallel --nice 18 --delay 0.05 --files 100000x 100 |
|
||||
parallel -uj1 'cat {}; nice rm {} &'
|
||||
}
|
||||
100kx100k > 100kx100k
|
||||
# Transpose it
|
||||
transpose 100kx100k > 100kx100k.t
|
||||
|
||||
This takes around 700 MB/core and 20 minutes to run on 64C64T.
|
||||
|
||||
|
||||
=head1 LIMITATIONS
|
||||
|
||||
B<transpose> is tested on a 1000000x1000000 3.6 TB table.
|
||||
|
||||
There is a limit on how many filehandles can be used in super_paste.
|
||||
This is probably in the order of 1000000. This limits is the number of
|
||||
temporary files. By increasing the block size the number of temporary
|
||||
files will be lowered. The 3.6 TB test resulted in 36000 files, so if
|
||||
the limit is 1000000 files, it should work fine up to 100 TB before
|
||||
you need to increase the block size.
|
||||
|
||||
cat medium.tsv | transpose -d '\t' > muidem.tsv
|
||||
|
||||
=head1 DESIGN
|
||||
|
||||
B<transpose> is designed to deal efficiently with medium sized data
|
||||
(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It
|
||||
works by chopping the input into 10 MB blocks. Each block is
|
||||
(up to 30 TB files) on systems with 2 GB RAM per CPU core. It works by
|
||||
chopping the input into blocks (default: 100 MB). Each block is
|
||||
transposed in parallel and saved to disk. Then these files are pasted
|
||||
together and finally removed.
|
||||
|
||||
B<transpose> uses B<csvtool> if installed and a (slower) perl script
|
||||
otherwise.
|
||||
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
B<transpose> makes files in $TMPDIR (default: /tmp). These are not
|
||||
cleaned up, if B<transpose> is stopped abnormally (e.g. killed).
|
||||
|
||||
|
||||
=head1 REPORTING BUGS
|
||||
|
||||
Report bugs to <tange@gnu.org>.
|
||||
|
@ -59,7 +112,7 @@ Report bugs to <tange@gnu.org>.
|
|||
|
||||
=head1 AUTHOR
|
||||
|
||||
Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free
|
||||
Copyright (C) 2013-2020 Ole Tange, http://ole.tange.dk and Free
|
||||
Software Foundation, Inc.
|
||||
|
||||
|
||||
|
@ -178,22 +231,34 @@ B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
|
|||
|
||||
=head1 SEE ALSO
|
||||
|
||||
B<bash>(1), B<parallel>(1), B<paste>(1)
|
||||
B<csvtool>(1), B<bash>(1), B<parallel>(1), B<paste>(1)
|
||||
|
||||
=cut
|
||||
_EOS
|
||||
#'
|
||||
|
||||
# Timings: 100kx200k (114GB) 34min
|
||||
# 200kx200k (228GB) 63min
|
||||
|
||||
# transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv
|
||||
# cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv
|
||||
|
||||
transpose_inner() {
|
||||
# simple in-memory transpose
|
||||
# -d sep
|
||||
# Input:
|
||||
transpose_perl() {
|
||||
# Simple in-memory transpose
|
||||
# Standard input:
|
||||
# data to be transposed
|
||||
# Output:
|
||||
# Standard output:
|
||||
# transposed data
|
||||
perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
|
||||
sep="$1"
|
||||
shift
|
||||
|
||||
if [ "$sep" == '\s+' ] ; then
|
||||
# Multiple spaces = separator
|
||||
space_merger() { tr ' ' ' ' | tr -s ' '; }
|
||||
sep=" "
|
||||
else
|
||||
space_merger() { cat; }
|
||||
fi
|
||||
space_merger | perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
|
||||
#!/usr/bin/perl
|
||||
|
||||
use Text::CSV;
|
||||
use Getopt::Long;
|
||||
|
||||
|
@ -233,103 +298,225 @@ sub simple {
|
|||
print map { join($sep,@$_),"\n" } @table;
|
||||
}
|
||||
cut-here-UbsAqi0j6GoOuk5W5yWA
|
||||
) "$@"
|
||||
) -d "$sep" "$@"
|
||||
}
|
||||
export -f transpose_inner
|
||||
export -f transpose_perl
|
||||
|
||||
stdin_to_paste_files() {
|
||||
# Run transpose_inner on blocks from stdin
|
||||
# output each block as file name
|
||||
local block_size
|
||||
local sep
|
||||
block_size="$1"
|
||||
sep="$2"
|
||||
PARALLEL="-k --files --block $block_size" \
|
||||
parallel --pipe transpose_inner -d "'$sep'"
|
||||
transpose_csvtool() {
|
||||
# Use csvtool to transpose
|
||||
# Standard input:
|
||||
# data to be transposed
|
||||
# Standard output:
|
||||
# transposed data
|
||||
sep="$1"
|
||||
if [ "$sep" == "\s+" ] ; then
|
||||
# Multiple spaces = separator
|
||||
tr ' ' ' ' | tr -s ' ' |
|
||||
csvtool transpose -t " " -u " " -
|
||||
else
|
||||
csvtool transpose -t "$sep" -u "$sep" -
|
||||
fi
|
||||
}
|
||||
export -f transpose_csvtool
|
||||
|
||||
detect_transposer() {
|
||||
# Find the fastest transpose tool installed
|
||||
if bash -c 'echo 1 | transpose_csvtool ,' >/dev/null 2>&1; then
|
||||
echo transpose_csvtool
|
||||
else
|
||||
if bash -c 'echo 1 | transpose_perl ,' >/dev/null 2>&1; then
|
||||
echo transpose_perl
|
||||
else
|
||||
echo Error: You need the Perl Text::CSV module or csvtool. >&2
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
file_to_paste_files() {
|
||||
# Run transpose_inner on blocks from $file
|
||||
# output each block as file name
|
||||
local block_size
|
||||
local sep
|
||||
make_paste_files() {
|
||||
# Transpose input in blocks
|
||||
# Output:
|
||||
# each transposed block as file name
|
||||
block_size="$1"
|
||||
sep="$2"
|
||||
file="$3"
|
||||
PARALLEL="-k --files --block $block_size" \
|
||||
parallel --pipe-part -a "$file" transpose_inner -d "'$sep'"
|
||||
transposer=$(detect_transposer)
|
||||
par_opt="-k --files --block $block_size"
|
||||
if [ -z "$file" ]; then
|
||||
parallel $par_opt --pipe "$transposer" "'$sep'"
|
||||
else
|
||||
parallel $par_opt --pipe-part -a "$file" "$transposer" "'$sep'"
|
||||
fi
|
||||
}
|
||||
|
||||
super_paste() {
|
||||
# Like 'paste' up to 1000000 files
|
||||
# More than 250000 files requires extra filehandles for GNU Parallel
|
||||
# The files are read from stdin
|
||||
local sep
|
||||
local paste_files
|
||||
local fifo
|
||||
cleanup() {
|
||||
printf "\rSIGINT caught \n" >&2
|
||||
(rm -rf "$TMPDIR" &)&
|
||||
exit 1
|
||||
}
|
||||
|
||||
other_commands() {
|
||||
printf "\rSIGINT caught "
|
||||
ls -l $paste_files
|
||||
cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
|
||||
rm $paste_files
|
||||
}
|
||||
|
||||
trap 'other_commands' SIGINT
|
||||
trap 'cleanup' SIGINT
|
||||
|
||||
sep="$1"
|
||||
paste_files=`tempfile`
|
||||
paste_files=$(tempfile)
|
||||
# basename
|
||||
fifo=`tempfile`
|
||||
rm $fifo
|
||||
fifo=$(tempfile)
|
||||
rm "$fifo"
|
||||
# Group files from stdin in groups of 1000 files
|
||||
parallel -k -n1000 echo > $paste_files
|
||||
parallel -k -n1000 echo > "$paste_files"
|
||||
|
||||
# Define replacement string {0#} to 0-pad job number
|
||||
export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
|
||||
$_=sprintf("%0${f}d",seq())'\'
|
||||
|
||||
# Make fifos that can be read from
|
||||
cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}"
|
||||
cat "$paste_files" | parallel "rm -f '$fifo'{0#}; mkfifo '$fifo'{0#}"
|
||||
|
||||
# Start a paste process for every 1000 files
|
||||
cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" &
|
||||
cat "$paste_files" | parallel -j0 "eval paste -d \''$sep'\' {} > '$fifo'{0#}" &
|
||||
|
||||
# Paste all the fifos
|
||||
eval paste -d "'$sep'" $fifo*
|
||||
eval paste -d "'$sep'" "$fifo"*
|
||||
|
||||
# Cleanup
|
||||
cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
|
||||
rm $paste_files
|
||||
cat "$paste_files" | parallel "eval rm -f {} '$fifo'{0#}"
|
||||
rm "$paste_files"
|
||||
}
|
||||
|
||||
stdin_detect_sep() {
|
||||
# Read the first 3 lines and detect the separator
|
||||
# Save the read input to file
|
||||
local file
|
||||
file="$1"
|
||||
# TODO
|
||||
echo "$d"
|
||||
# Read the first 3 lines of stdin and detect the separator
|
||||
# Only , space tab ; | : \0 and whitespace are detected
|
||||
# Save the 3 lines of input to a file so they can be read again later
|
||||
perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
|
||||
#!/usr/bin/perl
|
||||
|
||||
sub max(@) {
|
||||
# Returns:
|
||||
# Maximum value of array
|
||||
my $max;
|
||||
for (@_) {
|
||||
# Skip undefs
|
||||
defined $_ or next;
|
||||
defined $max or do { $max = $_; next; }; # Set $_ to the first non-undef
|
||||
$max = ($max > $_) ? $max : $_;
|
||||
}
|
||||
return $max;
|
||||
}
|
||||
|
||||
sub find_sep(@) {
|
||||
# Try to find the separator among the common candidates.
|
||||
# Do we get the same for each line?
|
||||
my @csv = grep { not /^#/ } @_;
|
||||
my @sep = (",", "\t", ";", ' ', '\|', ':', "\0", '\s+');
|
||||
my $columns;
|
||||
my %col;
|
||||
for my $sep (@sep) {
|
||||
for my $line (@csv) {
|
||||
$columns = split /$sep/, $line;
|
||||
if($columns > 1) {
|
||||
$col{$sep."\0".$columns}++
|
||||
}
|
||||
}
|
||||
}
|
||||
# Find max $col{$sep,$columns}
|
||||
my $most_lines = max(values %col);
|
||||
|
||||
my %sepcol = (map { split /\0/, $_ }
|
||||
grep { $col{$_} == $most_lines } keys %col);
|
||||
my $most_cols = max(values %sepcol);
|
||||
return ((grep { $sepcol{$_} == $most_cols } keys %sepcol)[0]);
|
||||
}
|
||||
|
||||
my $buf = "";
|
||||
my $newlines = 0;
|
||||
open(OUT, "+>", shift) || die;
|
||||
# Copy (at least) 3 lines to OUT
|
||||
while(sysread(STDIN,$buf,131072)) {
|
||||
print OUT $buf;
|
||||
$newlines += $buf =~ tr/\n/\n/;
|
||||
if($newlines >= 3) {
|
||||
last;
|
||||
}
|
||||
}
|
||||
seek(OUT,0,0) || die;
|
||||
my @lines = <OUT>;
|
||||
close OUT;
|
||||
# Remove last half-line
|
||||
pop @lines;
|
||||
print find_sep(@lines);
|
||||
|
||||
cut-here-UbsAqi0j6GoOuk5W5yWA
|
||||
) "$@"
|
||||
}
|
||||
|
||||
matrix() {
|
||||
# Generate table X by Y
|
||||
row="$1"
|
||||
col="$2"
|
||||
sep="$3"
|
||||
|
||||
mxn() { perl -E 'for(1..'$1') { say join "'$3'", map {int(rand()*123456)} 1..'$2' } '; }
|
||||
export -f mxn
|
||||
thr=$(parallel --number-of-threads)
|
||||
prow=$((row/thr/100))
|
||||
seq $((thr*100)) | parallel mxn $prow $col $sep
|
||||
mxn $((row-prow*thr*100)) $col $sep
|
||||
}
|
||||
|
||||
demo() {
|
||||
# Generate table X by Y
|
||||
row="$1"
|
||||
col="$2"
|
||||
sep="$3"
|
||||
|
||||
# Generate string "- - - - " for each col
|
||||
paste_format_string=$(perl -e 'print map {"- "} 1..'$col)
|
||||
# Generate dummy values
|
||||
while seq 123456; do true; done |
|
||||
# Use paste's format string to make $col columns
|
||||
paste -d "$sep" $paste_format_string |
|
||||
# Keep the top $row rows
|
||||
head -n "$row"
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [-d delimiter] [-b blocksize]" 1>&2; exit 1;
|
||||
}
|
||||
|
||||
version() {
|
||||
cat <<EOF
|
||||
transpose 20201130
|
||||
Copyright (C) 2020 Ole Tange, http://ole.tange.dk
|
||||
License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>
|
||||
This is free software: you are free to change and redistribute it.
|
||||
GNU parallel comes with no warranty.
|
||||
|
||||
Web site: https://gitlab.com/ole.tange/tangetools/-/tree/master/transpose
|
||||
EOF
|
||||
}
|
||||
|
||||
main() {
|
||||
block_size=10M
|
||||
while getopts ":b:d:" o; do
|
||||
case "${o}" in
|
||||
block_size=100M
|
||||
while getopts ":b:d:V" o; do
|
||||
case "$o" in
|
||||
d)
|
||||
d="$(printf "${OPTARG}")"
|
||||
if [ "'" = "${d}" ] ; then
|
||||
# Convert \t to TAB using printf
|
||||
d=$(printf "$OPTARG")
|
||||
if [ "'" = "$d" ] ; then
|
||||
echo "Delimiter cannot be '"
|
||||
usage
|
||||
exit
|
||||
exit 0
|
||||
fi
|
||||
;;
|
||||
b)
|
||||
block_size="${OPTARG}"
|
||||
block_size="$OPTARG"
|
||||
;;
|
||||
V)
|
||||
version
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
usage
|
||||
|
@ -337,22 +524,46 @@ main() {
|
|||
esac
|
||||
done
|
||||
shift $((OPTIND-1))
|
||||
|
||||
if [ -z "${d}" ] ; then
|
||||
d="$(printf "\t")"
|
||||
fi
|
||||
|
||||
file="$@"
|
||||
first_lines=`tempfile`
|
||||
file="$1"
|
||||
sep="$d"
|
||||
# Put all tempfiles into a single dir
|
||||
export TMPDIR=`mktemp -d`
|
||||
first_lines=$(tempfile)
|
||||
if [ -z "$file" ]; then
|
||||
sep="$(stdin_detect_sep $first_lines)"
|
||||
(cat $first_lines; rm $first_lines; cat) |
|
||||
stdin_to_paste_files $block_size "$sep" | super_paste "$sep"
|
||||
if [ -z "$sep" ] ; then
|
||||
sep=$(stdin_detect_sep $first_lines)
|
||||
if [ -z "$sep" ] ; then
|
||||
echo "transpose: Cannot autodetect separator. Use -d" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
if [ "$sep" == '\s+' ] ; then
|
||||
# Multiple spaces = separator
|
||||
osep=" "
|
||||
else
|
||||
osep="$sep"
|
||||
fi
|
||||
# Prepend stdin with the lines read in stdin_detect_sep
|
||||
(cat "$first_lines"; rm "$first_lines"; cat) |
|
||||
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
|
||||
else
|
||||
sep="$(stdin_detect_sep < "$file" $first_lines)"
|
||||
rm $first_lines
|
||||
file_to_paste_files $block_size "$sep" "$file" | super_paste "$sep"
|
||||
if [ -z "$sep" ] ; then
|
||||
sep=$(stdin_detect_sep < "$file" "$first_lines")
|
||||
if [ -z "$sep" ] ; then
|
||||
echo "transpose: Cannot autodetect separator. Use -d" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
rm "$first_lines"
|
||||
if [ "$sep" == '\s+' ] ; then
|
||||
# Multiple spaces = separator
|
||||
osep=" "
|
||||
else
|
||||
osep="$sep"
|
||||
fi
|
||||
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
|
||||
fi
|
||||
rmdir "$TMPDIR" 2>/dev/null
|
||||
}
|
||||
|
||||
# Make sure the whole file is read before starting
|
||||
|
|
|
@ -1,94 +0,0 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use Text::CSV;
|
||||
use File::Temp qw(tempfile tempdir);
|
||||
|
||||
my $csv;
|
||||
my (@table);
|
||||
my $first_line = 1;
|
||||
my $col = 0;
|
||||
while(my $l = <>) {
|
||||
if($first_line) {
|
||||
my $csv_setting = guess_csv_setting($l);
|
||||
$csv = Text::CSV->new($csv_setting)
|
||||
or die "Cannot use CSV: ".Text::CSV->error_diag ();
|
||||
$first_line = 0;
|
||||
}
|
||||
if(not $csv->parse($l)) {
|
||||
die "CSV has unexpected format";
|
||||
}
|
||||
# append to each row
|
||||
my $row = 0;
|
||||
|
||||
for($csv->fields()) {
|
||||
$table[$row][$col] = defined($_) ? $_ : '';
|
||||
$row++;
|
||||
}
|
||||
$col++;
|
||||
}
|
||||
|
||||
print map { join("\t",@$_),"\n" } @table;
|
||||
|
||||
sub guess_csv_setting {
|
||||
# Based on two lines guess the csv_setting
|
||||
my $line = shift;
|
||||
# Potential field separators
|
||||
# Priority:
|
||||
# \0 if both lines have the same number
|
||||
# \t if both lines have the same number
|
||||
my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/");
|
||||
my %count;
|
||||
@count{@fieldsep} = (0,0,0,0,0,0);
|
||||
# Count characters
|
||||
map { $count{$_}++ } split //,$line;
|
||||
my @sepsort = sort { $count{$b} <=> $count{$a} } @fieldsep;
|
||||
my $guessed_sep;
|
||||
if($count{"\0"} > 0) {
|
||||
# \0 is in the line => this is definitely the field sep
|
||||
$guessed_sep = "\0";
|
||||
} elsif($count{"\t"} > 0) {
|
||||
# \t is in the line => this is definitely the field sep
|
||||
$guessed_sep = "\t";
|
||||
} else {
|
||||
$guessed_sep = $sepsort[0];
|
||||
}
|
||||
return { binary => 1, sep_char => $guessed_sep };
|
||||
}
|
||||
|
||||
sub _guess_csv_setting {
|
||||
# Try different csv_settings
|
||||
# Return a $csv object with the best setting
|
||||
my @csv_file_types =
|
||||
( { binary => 1, sep_char => "\0" },
|
||||
{ binary => 1, sep_char => "\t" },
|
||||
{ binary => 1, sep_char => "," },
|
||||
{ binary => 1 },
|
||||
);
|
||||
|
||||
my $succesful_csv_type;
|
||||
my $csv;
|
||||
for my $csv_file_type (@csv_file_types) {
|
||||
$csv = Text::CSV->new ( $csv_file_type )
|
||||
or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag ();
|
||||
$succesful_csv_type = $csv_file_type;
|
||||
my $last_n_fields;
|
||||
for my $line (@lines) {
|
||||
if($csv->parse($line)) {
|
||||
my $n_fields = ($csv->fields());
|
||||
$last_fields ||= $n_fields;
|
||||
|
||||
} else{
|
||||
$succesful_csv_type = 0;
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
if(not $succesful_csv_type) {
|
||||
$csv->error_diag();
|
||||
}
|
||||
|
||||
$csv = Text::CSV->new ( $succesful_csv_type ) # should set binary attribute.
|
||||
or die "Cannot use CSV: ".Text::CSV->error_diag ();
|
||||
return($csv);
|
||||
}
|
Loading…
Reference in a new issue