transpose: Autofind separator.

2020-11-28 21:39:35 +01:00 · 2020-11-28 21:39:35 +01:00 · 66be041e30
parent 39f19757ae
commit 66be041e30
3 changed files with 301 additions and 185 deletions
--- a/9
+++ b/9
@ -1,6 +1,6 @@
 CMD = blink 2grep 2search burncpu drac duplicate-packets em emoticons	\
 	encdir field find-first-fail forever fxkill G gitnext gitundo	\
-	goodpasswd histogram Loffice mtrr mirrorpdf neno off parsort	\
+	goodpasswd histogram Loffice mtrr mirrorpdf neno not off	\
 	pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean	\
 	rina rn rrm seekmaniac shython sound-reload splitvideo stdout	\
 	swapout T teetime timestamp tracefile transpose upsidedown	\
@ -12,10 +12,9 @@ all: blink/blink.1 2search/2grep.1 2search/2search.1			\
 	find-first-fail/find-first-fail.1 G/G.1 gitnext/gitnext.1	\
 	gitundo/gitundo.1 goodpasswd/goodpasswd.1			\
 	histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1		\
-	off/off.1 parsort/parsort.1 pdfman/pdfman.1 pidcmd/pidcmd.1	\
+	off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1	\
-	pidtree/pidtree.1 plotpipe/plotpipe.1 puniq/puniq.1		\
+	plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1	\
-	rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1			\
+	rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1	\
 	seekmaniac/seekmaniac.1 shython/shython.1			\
 	sound-reload/sound-reload.1 splitvideo/splitvideo.1		\
 	stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1		\
 	tracefile/tracefile.1 transpose/transpose.1 T/T.1		\
--- a/transpose/transpose
+++ b/transpose/transpose
@ -1,7 +1,8 @@
 #!/bin/bash
-: <<=cut
+: <<'_EOS'
 =pod
 =cut
 =head1 NAME
@ -9,30 +10,40 @@ transpose - transpose CSV file
 =head1 SYNOPSIS
-B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input>]
+... | B<transpose> [-d I<delim>] [-b I<blocksize>]
 B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input.csv>]
 =head1 DESCRIPTION
-B<transpose> will read a CSV fie
+B<transpose> will read a CSV file and write the transposed version of
 the file. I.e. rows will be columns, and columns will be rows.
 =head1 OPTIONS
 =over 9
-=item I<input>
+=item I<input.csv>
-Input CSV file. If none is given reads from STDIN (standard input).
+Input CSV file. If none is given B<transpose> reads from STDIN
 (standard input).
 =item B<-d> I<delim>
-Use I<delim> as delimiter in input and output.
+Use I<delim> as delimiter in input and output. If no delimiter is
 given, B<transpose> will read the first 3 rows and try to guess the
 delimiter.
 The autodetection does not work well if values contain a quoted
 delimiter: E.g. a,"value with quoted ,",other value
 =item B<-b> I<blocksize>
 Pass chunks of I<blocksize> bytes to the internal transposer. Memory
-usage will be 10 times I<blocksiz> per CPU core. Default is 10M.
+usage will be 10 times I<blocksize> per CPU core. Default is 100M.
 =back
@ -40,18 +51,60 @@ usage will be 10 times I<blocksiz> per CPU core. Default is 10M.
 =head1 EXAMPLES
-=head2 EXAMPLE: Transpose a medium sized TSV file
+=head2 EXAMPLE: Transpose a TSV file
    cat normal.tsv | transpose -d '\t' > transposed.tsv
 =head2 EXAMPLE: transpose 57 GB 100000x100000 CSV file
    # Generate 100000x100000 matrix
    100kx100k() {
        100000x() {
            while seq 123456 | shuf; do true; done |
                paste $(perl -e 'print map {"- "} 1..100000') |
                head -n $1
        }
        export -f 100000x
        seq 1000 | parallel --nice 18 --delay 0.05 --files 100000x 100 |
            parallel -uj1 'cat {}; nice rm {} &'
    }
    100kx100k > 100kx100k
    # Transpose it
    transpose 100kx100k > 100kx100k.t
 This takes around 700 MB/core and 20 minutes to run on 64C64T.
 =head1 LIMITATIONS
 B<transpose> is tested on a 1000000x1000000 3.6 TB table.
 There is a limit on how many filehandles can be used in super_paste.
 This is probably in the order of 1000000. This limits the number of
 temporary files. By increasing the block size the number of temporary
 files will be lowered. The 3.6 TB test resulted in 36000 files, so if
 the limit is 1000000 files, it should work fine up to 100 TB before
 you need to increase the block size.
    cat medium.tsv | transpose -d '\t' > muidem.tsv
 =head1 DESIGN
 B<transpose> is designed to deal efficiently with medium sized data
-(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It
+(up to 30 TB files) on systems with 2 GB RAM per CPU core. It works by
-works by chopping the input into 10 MB blocks. Each block is
+chopping the input into blocks (default: 100 MB). Each block is
 transposed in parallel and saved to disk. Then these files are pasted
 together and finally removed.
 B<transpose> uses B<csvtool> if installed and a (slower) perl script
 otherwise.
 =head1 BUGS
 B<transpose> makes files in $TMPDIR (default: /tmp). These are not
 cleaned up, if B<transpose> is stopped abnormally (e.g. killed).
 =head1 REPORTING BUGS
 Report bugs to <tange@gnu.org>.
@ -59,7 +112,7 @@ Report bugs to <tange@gnu.org>.
 =head1 AUTHOR
-Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free
+Copyright (C) 2013-2020 Ole Tange, http://ole.tange.dk and Free
 Software Foundation, Inc.
@ -178,22 +231,34 @@ B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
 =head1 SEE ALSO
-B<bash>(1), B<parallel>(1), B<paste>(1)
+B<csvtool>(1), B<bash>(1), B<parallel>(1), B<paste>(1)
 =cut
 _EOS
 #'
 # Timings: 100kx200k (114GB) 34min
 # 200kx200k (228GB) 63min
-# transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv
+transpose_perl() {
-# cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv
+    # Simple in-memory transpose
-
+    # Standard input:
 transpose_inner() {
    # simple in-memory transpose
    # -d sep
    # Input:
    #   data to be transposed
-    # Output:
+    # Standard output:
    #   transposed data
-    perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
+    sep="$1"
    shift
    if [ "$sep" == '\s+' ] ; then
 	# Multiple spaces = separator
 	space_merger() { tr '	 ' ' ' | tr -s ' '; }
 	sep=" "
    else
 	space_merger() { cat; }
    fi
    space_merger | perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
 #!/usr/bin/perl
 use Text::CSV;
 use Getopt::Long;
@ -233,103 +298,225 @@ sub simple {
    print map { join($sep,@$_),"\n" } @table;
 }
 cut-here-UbsAqi0j6GoOuk5W5yWA
-	  ) "$@"
+	  ) -d "$sep" "$@"
 }
-export -f transpose_inner
+export -f transpose_perl
-stdin_to_paste_files() {
+transpose_csvtool() {
-    # Run transpose_inner on blocks from stdin
+    # Use csvtool to transpose
-    # output each block as file name
+    # Standard input:
-    local block_size
+    #   data to be transposed
-    local sep
+    # Standard output:
-    block_size="$1"
+    #   transposed data
-    sep="$2"
+    sep="$1"
-    PARALLEL="-k --files --block $block_size" \
+    if [ "$sep" == "\s+" ] ; then
-	    parallel --pipe transpose_inner -d "'$sep'"
+	# Multiple spaces = separator
 	tr '	 ' ' ' | tr -s ' ' |
 	    csvtool transpose -t " " -u " " -
    else
 	csvtool transpose -t "$sep" -u "$sep" -
    fi
 }
 export -f transpose_csvtool
 detect_transposer() {
    # Find the fastest transpose tool installed
    # Standard output:
    #   name of the transposer function to use (transpose_csvtool or
    #   transpose_perl); nothing if neither works
    # Returns:
    #   0 if a working transposer was found, 1 otherwise
    local candidate
    # Candidates in order of preference. Each one is probed in a fresh
    # bash, so it only works if the function has been exported.
    for candidate in transpose_csvtool transpose_perl; do
        if bash -c "echo 1 | $candidate ," >/dev/null 2>&1; then
            echo "$candidate"
            return 0
        fi
    done
    echo Error: You need the Perl Text::CSV module or csvtool. >&2
    # Let the caller see that no transposer is available
    return 1
 }
-file_to_paste_files() {
+make_paste_files() {
-    # Run transpose_inner on blocks from $file
+    # Transpose input in blocks
-    # output each block as file name
+    # Output:
-    local block_size
+    #   each transposed block as file name
    local sep
    block_size="$1"
    sep="$2"
    file="$3"
-    PARALLEL="-k --files --block $block_size" \
+    transposer=$(detect_transposer)
-	    parallel --pipe-part -a "$file" transpose_inner -d "'$sep'"
+    par_opt="-k --files --block $block_size"
    if [ -z "$file" ]; then
 	parallel $par_opt --pipe "$transposer" "'$sep'"
    else
 	parallel $par_opt --pipe-part -a "$file" "$transposer" "'$sep'"
    fi
 }
 super_paste() {
    # Like 'paste' up to 1000000 files
    # More than 250000 files requires extra filehandles for GNU Parallel
    # The files are read from stdin
-    local sep
+    cleanup() {
-    local paste_files
+	printf "\rSIGINT caught      \n" >&2
-    local fifo
+	(rm -rf "$TMPDIR" &)&
-
+	exit 1
 other_commands() {
    printf "\rSIGINT caught      "
    ls -l $paste_files
    cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
    rm $paste_files
    }
-    trap 'other_commands' SIGINT
+    trap 'cleanup' SIGINT
    sep="$1"
-    paste_files=`tempfile`
+    paste_files=$(tempfile)
    # basename
-    fifo=`tempfile`
+    fifo=$(tempfile)
-    rm $fifo
+    rm "$fifo"
    # Group files from stdin in groups of 1000 files
-    parallel -k -n1000 echo > $paste_files
+    parallel -k -n1000 echo > "$paste_files"
    # Define replacement string {0#} to 0-pad job number
    export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
                    $_=sprintf("%0${f}d",seq())'\'
    # Make fifos that can be read from
-    cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}"
+    cat "$paste_files" | parallel "rm -f '$fifo'{0#}; mkfifo '$fifo'{0#}"
    # Start a paste process for every 1000 files
-    cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" &
+    cat "$paste_files" | parallel -j0 "eval paste -d \''$sep'\' {} > '$fifo'{0#}" &
    # Paste all the fifos
-    eval paste -d "'$sep'" $fifo*
+    eval paste -d "'$sep'" "$fifo"*
    # Cleanup
-    cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
+    cat "$paste_files" | parallel "eval rm -f {} '$fifo'{0#}"
-    rm $paste_files
+    rm "$paste_files"
 }
 stdin_detect_sep() {
-    # Read the first 3 lines and detect the separator
+    # Read the first 3 lines of stdin and detect the separator
-    # Save the read input to file
+    # Only , space tab ; | :  \0 and whitespace are detected
-    local file
+    # Save the 3 lines input to file so it can be read again later
-    file="$1"
+    perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
-    # TODO
+#!/usr/bin/perl
-    echo "$d"
+
 sub max(@) {
    # Numerically largest defined value among the arguments
    # Returns:
    #   the maximum, or undef if no argument is defined
    my $best;
    # Undefined arguments are ignored
    for my $candidate (grep { defined } @_) {
        # The first defined value starts as the maximum. After that a
        # candidate replaces it unless the current maximum is strictly
        # larger.
        if(not defined $best or not $best > $candidate) {
            $best = $candidate;
        }
    }
    return $best;
 }
 sub find_sep(@) {
    # Guess the field separator by trying the common ones.
    # The winner is the separator that splits the most lines into the
    # same number (>1) of columns. If several do, the one giving the
    # most columns wins. If there is still a tie, the separator listed
    # first in @sep wins, so the result is deterministic.
    # Input:
    #   lines of the table (lines starting with # are ignored)
    # Returns:
    #   the separator as a regexp string (e.g. '\s+'), or nothing if
    #   no candidate splits any line into more than one column
    # NOTE(review): '|' is returned in its regexp form '\|' - confirm
    # that the callers accept that form.
    my @csv = grep { not /^#/ } @_;
    my @sep = (",", "\t", ";", ' ', '\|', ':', "\0", '\s+');
    my ($best_sep, $best_lines, $best_cols);
    for my $sep (@sep) {
        # $lines_with{$columns} = number of lines with $columns columns
        my %lines_with;
        for my $line (@csv) {
            my @fields = split /$sep/, $line;
            if(@fields > 1) {
                $lines_with{scalar @fields}++;
            }
        }
        for my $columns (keys %lines_with) {
            my $lines = $lines_with{$columns};
            # Strict comparisons: an earlier separator keeps a tie
            if(not defined $best_sep
               or $lines > $best_lines
               or ($lines == $best_lines and $columns > $best_cols)) {
                ($best_sep, $best_lines, $best_cols) =
                    ($sep, $lines, $columns);
            }
        }
    }
    defined $best_sep or return;
    return $best_sep;
 }
 # Copy the start of STDIN to the file given as first argument and
 # print the separator guessed from the complete lines that were copied.
 my $buf = "";
 my $newlines = 0;
 # True unless reading stops before the end of STDIN
 my $at_eof = 1;
 my $outfile = shift;
 open(my $out, "+>", $outfile) || die "Cannot open '$outfile': $!";
 # Copy (at least) 3 lines to $out
 while(sysread(STDIN,$buf,131072)) {
    print $out $buf;
    $newlines += ($buf =~ tr/\n//);
    if($newlines >= 3) {
        $at_eof = 0;
        last;
    }
 }
 seek($out,0,0) || die "Cannot rewind '$outfile': $!";
 my @lines = <$out>;
 close $out;
 # Remove the last line only if it can be a half-line: reading stopped
 # before end of input and the line has no newline. Otherwise the line
 # is complete and is kept, so inputs of 1-2 lines can be detected, too.
 if(@lines and not $at_eof and $lines[-1] !~ /\n\z/) {
    pop @lines;
 }
 print find_sep(@lines);
 cut-here-UbsAqi0j6GoOuk5W5yWA
 	  ) "$@"
 }
 matrix() {
    # Generate a table of random integers on standard output
    # $1 = number of rows
    # $2 = number of columns
    # $3 = separator
    # row, col, sep, thr and prow are not declared local, so they are
    # set in the caller's shell, too.
    row="$1"
    col="$2"
    sep="$3"
    # mxn rows cols sep: print 'rows' lines of 'cols' random integers
    # (0..123455) joined by 'sep'. The arguments are pasted into the
    # perl source, so they must not contain characters special to perl.
    mxn() { perl -E 'for(1..'$1') { say join "'$3'", map {int(rand()*123456)} 1..'$2' } '; }
    # Exported so the shells started by parallel can run it
    export -f mxn
    thr=$(parallel --number-of-threads)
    # Rows per job when the work is split into thr*100 jobs
    prow=$((row/thr/100))
    # NOTE(review): parallel presumably appends the value from seq as a
    # 4th argument, which mxn does not use - confirm
    seq $((thr*100)) | parallel mxn $prow $col $sep
    # Generate the rows left over by the integer division above
    mxn $((row-prow*thr*100)) $col $sep
 }
 demo() {
    # Generate a demo table of $1 rows by $2 columns on standard output
    # $1 = number of rows
    # $2 = number of columns
    # $3 = separator (given to paste -d)
    # row, col, sep and paste_format_string are not declared local, so
    # they are set in the caller's shell, too.
    row="$1"
    col="$2"
    sep="$3"
    # Generate string "- - - - " for each col
    paste_format_string=$(perl -e 'print map {"- "} 1..'$col)
    # Generate dummy values: the numbers 1..123456 over and over.
    # The loop has no end condition of its own; presumably it stops
    # when head exits and the pipe is closed.
    while seq 123456; do true; done |
 	# Use paste's format string to make $col columns
        paste -d "$sep" $paste_format_string |
 	# Keep the top $row rows
        head -n "$row"
 }
 usage() {
    # Print a short usage message on standard error and exit with
    # status 1. The message lists all the options that main parses
    # (-d, -b, -V) and the optional input file.
    echo "Usage: $0 [-d delimiter] [-b blocksize] [-V] [input.csv]" 1>&2
    exit 1
 }
 version() {
    # Print version, copyright and license information on standard output
    # NOTE(review): the warranty line says "GNU parallel" - presumably
    # copied from GNU Parallel's version text; confirm whether it
    # should say "transpose".
    cat <<EOF
 transpose 20201130
 Copyright (C) 2020 Ole Tange, http://ole.tange.dk
 License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>
 This is free software: you are free to change and redistribute it.
 GNU parallel comes with no warranty.
 Web site: https://gitlab.com/ole.tange/tangetools/-/tree/master/transpose
 EOF
 }
 main() {
-    block_size=10M
+    block_size=100M
-    while getopts ":b:d:" o; do
+    while getopts ":b:d:V" o; do
-	case "${o}" in
+	case "$o" in
 	    d)
-		d="$(printf "${OPTARG}")"
+		# Convert \t to TAB using printf
-		if [ "'" = "${d}" ] ; then
+		d=$(printf "$OPTARG")
 		if [ "'" = "$d" ] ; then
 		    echo "Delimiter cannot be '"
 		    usage
-		    exit
+		    exit 0
 		fi
 		;;
 	    b)
-		block_size="${OPTARG}"
+		block_size="$OPTARG"
 		;;
 	    V)
 		version
 		exit 0
 		;;
 	    *)
 		usage
@ -337,22 +524,46 @@ main() {
 	esac
    done
    shift $((OPTIND-1))
-
+    file="$1"
-    if [ -z "${d}" ] ; then
+    sep="$d"
-	d="$(printf "\t")"
+    # Put all tempfiles into a single dir
-    fi
+    export TMPDIR=`mktemp -d`
-
+    first_lines=$(tempfile)
    file="$@"
    first_lines=`tempfile`
    if [ -z "$file" ]; then
-	sep="$(stdin_detect_sep $first_lines)"
+	if [ -z "$sep" ] ; then
-	(cat $first_lines; rm $first_lines; cat) |
+	    sep=$(stdin_detect_sep $first_lines)
-	    stdin_to_paste_files $block_size "$sep" | super_paste "$sep"
+	    if [ -z "$sep" ] ; then
-    else
+		echo "transpose: Cannot autodetect separator. Use -d" >&2
-	sep="$(stdin_detect_sep < "$file" $first_lines)"
+		exit 1
 	rm $first_lines
 	file_to_paste_files $block_size "$sep" "$file" | super_paste "$sep"
 	    fi
 	fi
 	if [ "$sep" == '\s+' ] ; then
 	    # Multiple spaces = separator
 	    osep=" "
 	else
 	    osep="$sep"
 	fi
 	# Prepend stdin with the lines read in stdin_detect_sep
 	(cat "$first_lines"; rm "$first_lines"; cat) |
 	    make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
    else
 	if [ -z "$sep" ] ; then
 	    sep=$(stdin_detect_sep < "$file" "$first_lines")
 	    if [ -z "$sep" ] ; then
 		echo "transpose: Cannot autodetect separator. Use -d" >&2
 		exit 1
 	    fi
 	fi
 	rm "$first_lines"
 	if [ "$sep" == '\s+' ] ; then
 	    # Multiple spaces = separator
 	    osep=" "
 	else
 	    osep="$sep"
 	fi
 	make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
    fi
    rmdir "$TMPDIR" 2>/dev/null
 }
 # Make sure the whole file is read before starting
--- a/transpose/transpose-par.pl
+++ b/transpose/transpose-par.pl
@ -1,94 +0,0 @@
 #!/usr/bin/perl
 use Text::CSV;
 use File::Temp qw(tempfile tempdir);
 # Read the whole table from the files named on the command line (or
 # STDIN) and print it transposed, with TAB between the output fields.
 my $csv;
 my @table;
 # Number of input lines stored so far = output column to fill next
 my $line_no = 0;
 while(defined(my $line = <>)) {
    if(not defined $csv) {
        # The CSV parser is set up from a guess based on the first line
        my $setting = guess_csv_setting($line);
        $csv = Text::CSV->new($setting)
            or die "Cannot use CSV: ".Text::CSV->error_diag ();
    }
    $csv->parse($line) or die "CSV has unexpected format";
    # Field number N of this line goes into output row N
    my @fields = $csv->fields();
    for my $field_no (0 .. $#fields) {
        my $value = $fields[$field_no];
        $table[$field_no][$line_no] = defined($value) ? $value : '';
    }
    $line_no++;
 }
 for my $out_row (@table) {
    print join("\t",@$out_row),"\n";
 }
 sub guess_csv_setting {
    # Guess the Text::CSV setting based on a single line
    # Input:
    #   $line = a line of the table
    # Returns:
    #   hashref: { binary => 1, sep_char => the guessed separator }
    # Priority:
    #   \0 if the line contains one
    #   \t if the line contains one
    #   otherwise the candidate occurring most often in the line;
    #   on a tie the candidate listed first in @fieldsep
    my $line = shift;
    # Potential field separators
    my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/");
    # Every candidate starts at 0, so all counts are numbers
    my %count = map { $_ => 0 } @fieldsep;
    # Count the candidate characters only
    for my $char (split //, $line) {
        exists $count{$char} and $count{$char}++;
    }
    my $guessed_sep;
    if($count{"\0"} > 0) {
        # \0 is in the line => this is definitely the field sep
        $guessed_sep = "\0";
    } elsif($count{"\t"} > 0) {
        # \t is in the line => this is definitely the field sep
        $guessed_sep = "\t";
    } else {
        # Most frequent candidate. The strict > keeps the earliest
        # candidate on a tie (and "," if no candidate occurs at all).
        $guessed_sep = $fieldsep[0];
        for my $sep (@fieldsep) {
            if($count{$sep} > $count{$guessed_sep}) {
                $guessed_sep = $sep;
            }
        }
    }
    return { binary => 1, sep_char => $guessed_sep };
 }
 sub _guess_csv_setting {
    # Try different csv_settings
    # Return a $csv object with the best setting
    #
    # NOTE(review): no caller of this sub is visible in this file;
    # presumably it is an abandoned alternative to guess_csv_setting.
    # NOTE(review): @lines is not declared in this sub or in the visible
    # file, so the inner loop presumably never runs - confirm.
    # NOTE(review): the outer loop does not stop at the first setting
    # that parses, so only the last setting tried decides the result.
    my @csv_file_types = 
 	( { binary => 1, sep_char => "\0" },
 	  { binary => 1, sep_char => "\t" },
 	  { binary => 1, sep_char => "," },
 	  { binary => 1 },
 	);
    my $succesful_csv_type;
    my $csv;
    for my $csv_file_type (@csv_file_types) {
 	$csv = Text::CSV->new ( $csv_file_type )
 	    or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag ();
 	# Assume this setting works until a line fails to parse
 	$succesful_csv_type = $csv_file_type;
 	# NOTE(review): declared but never used; the assignment below
 	# goes to $last_fields instead.
 	my $last_n_fields;
 	for my $line (@lines) {
 	    if($csv->parse($line)) {
 		# The parentheses are on the right-hand side only, so
 		# this is a scalar assignment - presumably meant to be
 		# the number of fields; confirm.
 		my $n_fields = ($csv->fields());
 		$last_fields ||= $n_fields;
 	    } else{
 		# This setting cannot parse the line: Mark it as failed
 		$succesful_csv_type = 0;
 		last;
 	    }
 	}
    }
    if(not $succesful_csv_type) {
 	# The return value is not used here
 	$csv->error_diag();
    }
    $csv = Text::CSV->new ( $succesful_csv_type )  # should set binary attribute.
 	or die "Cannot use CSV: ".Text::CSV->error_diag ();
    return($csv);
 }