From 66be041e30097e77633006820bdda44aaaec492b Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Sat, 28 Nov 2020 21:39:35 +0100 Subject: [PATCH] transpose: Autofind separator. --- Makefile | 9 +- transpose/transpose | 383 ++++++++++++++++++++++++++++--------- transpose/transpose-par.pl | 94 --------- 3 files changed, 301 insertions(+), 185 deletions(-) delete mode 100755 transpose/transpose-par.pl diff --git a/Makefile b/Makefile index 87b6fd0..81284ac 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ CMD = blink 2grep 2search burncpu drac duplicate-packets em emoticons \ encdir field find-first-fail forever fxkill G gitnext gitundo \ - goodpasswd histogram Loffice mtrr mirrorpdf neno off parsort \ + goodpasswd histogram Loffice mtrr mirrorpdf neno not off \ pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \ rina rn rrm seekmaniac shython sound-reload splitvideo stdout \ swapout T teetime timestamp tracefile transpose upsidedown \ @@ -12,10 +12,9 @@ all: blink/blink.1 2search/2grep.1 2search/2search.1 \ find-first-fail/find-first-fail.1 G/G.1 gitnext/gitnext.1 \ gitundo/gitundo.1 goodpasswd/goodpasswd.1 \ histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \ - off/off.1 parsort/parsort.1 pdfman/pdfman.1 pidcmd/pidcmd.1 \ - pidtree/pidtree.1 plotpipe/plotpipe.1 puniq/puniq.1 \ - rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \ - seekmaniac/seekmaniac.1 shython/shython.1 \ + off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1 \ + plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1 \ + rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1 \ sound-reload/sound-reload.1 splitvideo/splitvideo.1 \ stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \ tracefile/tracefile.1 transpose/transpose.1 T/T.1 \ diff --git a/transpose/transpose b/transpose/transpose index 7739d4e..d155ac5 100755 --- a/transpose/transpose +++ b/transpose/transpose @@ -1,7 +1,8 @@ #!/bin/bash -: <<=cut +: <<'_EOS' =pod +=cut =head1 NAME @@ -9,30 +10,40 @@ transpose - 
transpose CSV file =head1 SYNOPSIS -B [-d I] [-b I] [I] +... | B [-d I] [-b I] + +B [-d I] [-b I] [I] + =head1 DESCRIPTION -B will read a CSV fie +B will read a CSV file and write the transposed version of +the file. I.e. rows will be columns, and columns will be rows. =head1 OPTIONS =over 9 -=item I +=item I -Input CSV file. If none is given reads from STDIN (standard input). +Input CSV file. If none is given B reads from STDIN +(standard input). =item B<-d> I -Use I as delimiter in input and output. +Use I as delimiter in input and output. If no delimiter is +given, B will read the first 3 rows and try to guess the +delimiter. + +The autodetection does not work well if values contain a quoted +delimiter: E.g. a,"value with quoted ,",other value =item B<-b> I Pass chunks of I bytes to the internal transposer. Memory -usage will be 10 times I per CPU core. Default is 10M. +usage will be 10 times I per CPU core. Default is 100M. =back @@ -40,18 +51,60 @@ usage will be 10 times I per CPU core. Default is 10M. =head1 EXAMPLES -=head2 EXAMPLE: Transpose a medium sized TSV file +=head2 EXAMPLE: Transpose a TSV file + + cat normal.tsv | transpose -d '\t' > transposed.tsv + +=head2 EXAMPLE: transpose 57 GB 100000x100000 CSV file + + # Generate 100000x100000 matrix + 100kx100k() { + 100000x() { + while seq 123456 | shuf; do true; done | + paste $(perl -e 'print map {"- "} 1..100000') | + head -n $1 + } + export -f 100000x + seq 1000 | parallel --nice 18 --delay 0.05 --files 100000x 100 | + parallel -uj1 'cat {}; nice rm {} &' + } + 100kx100k > 100kx100k + # Transpose it + transpose 100kx100k > 100kx100k.t + +This takes around 700 MB/core and 20 minutes to run on 64C64T. + + +=head1 LIMITATIONS + +B is tested on a 1000000x1000000 3.6 TB table. + +There is a limit on how many filehandles can be used in super_paste. +This is probably in the order of 1000000. This limits the number of +temporary files. 
By increasing the block size the number of temporary +files will be lowered. The 3.6 TB test resulted in 36000 files, so if +the limit is 1000000 files, it should work fine up to 100 TB before +you need to increase the block size. - cat medium.tsv | transpose -d '\t' > muidem.tsv =head1 DESIGN B is designed to deal efficiently with medium sized data -(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It -works by chopping the input into 10 MB blocks. Each block is +(up to 30 TB files) on systems with 2 GB RAM per CPU core. It works by +chopping the input into blocks (default: 100 MB). Each block is transposed in parallel and saved to disk. Then these files are pasted together and finally removed. +B uses B if installed and a (slower) perl script +otherwise. + + +=head1 BUGS + +B makes files in $TMPDIR (default: /tmp). These are not +cleaned up, if B is stopped abnormally (e.g. killed). + + =head1 REPORTING BUGS Report bugs to . @@ -59,7 +112,7 @@ Report bugs to . =head1 AUTHOR -Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free +Copyright (C) 2013-2020 Ole Tange, http://ole.tange.dk and Free Software Foundation, Inc. @@ -178,22 +231,34 @@ B uses Perl, B, B and B. 
=head1 SEE ALSO -B(1), B(1), B(1) +B(1), B(1), B(1), B(1) =cut +_EOS +#' +# Timings: 100kx200k (114GB) 34min +# 200kx200k (228GB) 63min -# transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv -# cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv - -transpose_inner() { - # simple in-memory transpose - # -d sep - # Input: +transpose_perl() { + # Simple in-memory transpose + # Standard input: # data to be transposed - # Output: + # Standard output: # transposed data - perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA' + sep="$1" + shift + + if [ "$sep" == '\s+' ] ; then + # Multiple spaces = separator + space_merger() { tr ' ' ' ' | tr -s ' '; } + sep=" " + else + space_merger() { cat; } + fi + space_merger | perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA' +#!/usr/bin/perl + use Text::CSV; use Getopt::Long; @@ -233,103 +298,225 @@ sub simple { print map { join($sep,@$_),"\n" } @table; } cut-here-UbsAqi0j6GoOuk5W5yWA - ) "$@" + ) -d "$sep" "$@" } -export -f transpose_inner +export -f transpose_perl -stdin_to_paste_files() { - # Run transpose_inner on blocks from stdin - # output each block as file name - local block_size - local sep - block_size="$1" - sep="$2" - PARALLEL="-k --files --block $block_size" \ - parallel --pipe transpose_inner -d "'$sep'" +transpose_csvtool() { + # Use cvstool to transpose + # Standard input: + # data to be transposed + # Standard output: + # transposed data + sep="$1" + if [ "$sep" == "\s+" ] ; then + # Multiple spaces = separator + tr ' ' ' ' | tr -s ' ' | + csvtool transpose -t " " -u " " - + else + csvtool transpose -t "$sep" -u "$sep" - + fi +} +export -f transpose_csvtool + +detect_transposer() { + # Find the fastest transpose tool installed + if bash -c 'echo 1 | transpose_csvtool ,' >/dev/null 2>&1; then + echo transpose_csvtool + else + if bash -c 'echo 1 | transpose_perl ,' >/dev/null 2>&1; then + echo transpose_perl + else + echo Error: You need the Perl Text::CSV module or csvtool. 
>&2 + fi + fi } -file_to_paste_files() { - # Run transpose_inner on blocks from $file - # output each block as file name - local block_size - local sep +make_paste_files() { + # Transpose input in blocks + # Output: + # each transposed block as file name block_size="$1" sep="$2" file="$3" - PARALLEL="-k --files --block $block_size" \ - parallel --pipe-part -a "$file" transpose_inner -d "'$sep'" + transposer=$(detect_transposer) + par_opt="-k --files --block $block_size" + if [ -z "$file" ]; then + parallel $par_opt --pipe "$transposer" "'$sep'" + else + parallel $par_opt --pipe-part -a "$file" "$transposer" "'$sep'" + fi } super_paste() { # Like 'paste' up to 1000000 files # More than 250000 files requires extra filehandles for GNU Parallel # The files are read from stdin - local sep - local paste_files - local fifo + cleanup() { + printf "\rSIGINT caught \n" >&2 + (rm -rf "$TMPDIR" &)& + exit 1 + } -other_commands() { - printf "\rSIGINT caught " - ls -l $paste_files - cat $paste_files | parallel "eval rm -f {} $fifo{0#}" - rm $paste_files -} - - trap 'other_commands' SIGINT + trap 'cleanup' SIGINT sep="$1" - paste_files=`tempfile` + paste_files=$(tempfile) # basename - fifo=`tempfile` - rm $fifo + fifo=$(tempfile) + rm "$fifo" # Group files from stdin in groups of 1000 files - parallel -k -n1000 echo > $paste_files + parallel -k -n1000 echo > "$paste_files" # Define replacement string {0#} to 0-pad job number export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10))); $_=sprintf("%0${f}d",seq())'\' # Make fifos that can be read from - cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}" + cat "$paste_files" | parallel "rm -f '$fifo'{0#}; mkfifo '$fifo'{0#}" # Start a paste process for every 1000 files - cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" & + cat "$paste_files" | parallel -j0 "eval paste -d \''$sep'\' {} > '$fifo'{0#}" & # Paste all the fifos - eval paste -d "'$sep'" $fifo* + eval paste -d "'$sep'" 
"$fifo"* # Cleanup - cat $paste_files | parallel "eval rm -f {} $fifo{0#}" - rm $paste_files + cat "$paste_files" | parallel "eval rm -f {} '$fifo'{0#}" + rm "$paste_files" } stdin_detect_sep() { - # Read the first 3 lines and detect the separator - # Save the read input to file - local file - file="$1" - # TODO - echo "$d" + # Read the first 3 lines of stdin and detect the separator + # Only , space tab ; | : \0 and whitespace are detected + # Save the 3 lines input to file so it can be read again later + perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA' +#!/usr/bin/perl + +sub max(@) { + # Returns: + # Maximum value of array + my $max; + for (@_) { + # Skip undefs + defined $_ or next; + defined $max or do { $max = $_; next; }; # Set $_ to the first non-undef + $max = ($max > $_) ? $max : $_; + } + return $max; +} + +sub find_sep(@) { + # Try common find the separators. + # Do we get the same for each line? + my @csv = grep { not /^#/ } @_; + my @sep = (",", "\t", ";", ' ', '\|', ':', "\0", '\s+'); + my $columns; + my %col; + for my $sep (@sep) { + for my $line (@csv) { + $columns = split /$sep/, $line; + if($columns > 1) { + $col{$sep."\0".$columns}++ + } + } + } + # Find max $col{$sep,$columns} + my $most_lines = max(values %col); + + my %sepcol = (map { split /\0/, $_ } + grep { $col{$_} == $most_lines } keys %col); + my $most_cols = max(values %sepcol); + return ((grep { $sepcol{$_} == $most_cols } keys %sepcol)[0]); +} + +my $buf = ""; +my $newlines = 0; +open(OUT, "+>", shift) || die; +# Copy (at least) 3 lines to OUT +while(sysread(STDIN,$buf,131072)) { + print OUT $buf; + $newlines += $buf =~ tr/\n/\n/; + if($newlines >= 3) { + last; + } +} +seek(OUT,0,0) || die; +my @lines = ; +close OUT; +# Remove last half-line +pop @lines; +print find_sep(@lines); + +cut-here-UbsAqi0j6GoOuk5W5yWA + ) "$@" +} + +matrix() { + # Generate table X by Y + row="$1" + col="$2" + sep="$3" + + mxn() { perl -E 'for(1..'$1') { say join "'$3'", map {int(rand()*123456)} 1..'$2' } '; } + 
export -f mxn + thr=$(parallel --number-of-threads) + prow=$((row/thr/100)) + seq $((thr*100)) | parallel mxn $prow $col $sep + mxn $((row-prow*thr*100)) $col $sep +} + +demo() { + # Generate table X by Y + row="$1" + col="$2" + sep="$3" + + # Generate string "- - - - " for each col + paste_format_string=$(perl -e 'print map {"- "} 1..'$col) + # Generate dummy values + while seq 123456; do true; done | + # Use paste's format string to make $col columns + paste -d "$sep" $paste_format_string | + # Keep the top $row rows + head -n "$row" } usage() { echo "Usage: $0 [-d delimiter] [-b blocksize]" 1>&2; exit 1; } +version() { + cat < +This is free software: you are free to change and redistribute it. +GNU parallel comes with no warranty. + +Web site: https://gitlab.com/ole.tange/tangetools/-/tree/master/transpose +EOF +} + main() { - block_size=10M - while getopts ":b:d:" o; do - case "${o}" in + block_size=100M + while getopts ":b:d:V" o; do + case "$o" in d) - d="$(printf "${OPTARG}")" - if [ "'" = "${d}" ] ; then + # Convert \t to TAB using printf + d=$(printf "$OPTARG") + if [ "'" = "$d" ] ; then echo "Delimiter cannot be '" usage - exit + exit 0 fi ;; b) - block_size="${OPTARG}" + block_size="$OPTARG" + ;; + V) + version + exit 0 ;; *) usage @@ -337,22 +524,46 @@ main() { esac done shift $((OPTIND-1)) - - if [ -z "${d}" ] ; then - d="$(printf "\t")" - fi - - file="$@" - first_lines=`tempfile` + file="$1" + sep="$d" + # Put all tempfiles into a single dir + export TMPDIR=`mktemp -d` + first_lines=$(tempfile) if [ -z "$file" ]; then - sep="$(stdin_detect_sep $first_lines)" - (cat $first_lines; rm $first_lines; cat) | - stdin_to_paste_files $block_size "$sep" | super_paste "$sep" + if [ -z "$sep" ] ; then + sep=$(stdin_detect_sep $first_lines) + if [ -z "$sep" ] ; then + echo "transpose: Cannot autodetect separator. 
Use -d" >&2 + exit 1 + fi + fi + if [ "$sep" == '\s+' ] ; then + # Multiple spaces = separator + osep=" " + else + osep="$sep" + fi + # Prepend stdin with the lines read in stdin_detect_sep + (cat "$first_lines"; rm "$first_lines"; cat) | + make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep" else - sep="$(stdin_detect_sep < "$file" $first_lines)" - rm $first_lines - file_to_paste_files $block_size "$sep" "$file" | super_paste "$sep" + if [ -z "$sep" ] ; then + sep=$(stdin_detect_sep < "$file" "$first_lines") + if [ -z "$sep" ] ; then + echo "transpose: Cannot autodetect separator. Use -d" >&2 + exit 1 + fi + fi + rm "$first_lines" + if [ "$sep" == '\s+' ] ; then + # Multiple spaces = separator + osep=" " + else + osep="$sep" + fi + make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep" fi + rmdir "$TMPDIR" 2>/dev/null } # Make sure the whole file is read before starting diff --git a/transpose/transpose-par.pl b/transpose/transpose-par.pl deleted file mode 100755 index 515704d..0000000 --- a/transpose/transpose-par.pl +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/perl - -use Text::CSV; -use File::Temp qw(tempfile tempdir); - -my $csv; -my (@table); -my $first_line = 1; -my $col = 0; -while(my $l = <>) { - if($first_line) { - my $csv_setting = guess_csv_setting($l); - $csv = Text::CSV->new($csv_setting) - or die "Cannot use CSV: ".Text::CSV->error_diag (); - $first_line = 0; - } - if(not $csv->parse($l)) { - die "CSV has unexpected format"; - } - # append to each row - my $row = 0; - - for($csv->fields()) { - $table[$row][$col] = defined($_) ? 
$_ : ''; - $row++; - } - $col++; -} - -print map { join("\t",@$_),"\n" } @table; - -sub guess_csv_setting { - # Based on two lines guess the csv_setting - my $line = shift; - # Potential field separators - # Priority: - # \0 if both lines have the same number - # \t if both lines have the same number - my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/"); - my %count; - @count{@fieldsep} = (0,0,0,0,0,0); - # Count characters - map { $count{$_}++ } split //,$line; - my @sepsort = sort { $count{$b} <=> $count{$a} } @fieldsep; - my $guessed_sep; - if($count{"\0"} > 0) { - # \0 is in the line => this is definitely the field sep - $guessed_sep = "\0"; - } elsif($count{"\t"} > 0) { - # \t is in the line => this is definitely the field sep - $guessed_sep = "\t"; - } else { - $guessed_sep = $sepsort[0]; - } - return { binary => 1, sep_char => $guessed_sep }; -} - -sub _guess_csv_setting { - # Try different csv_settings - # Return a $csv object with the best setting - my @csv_file_types = - ( { binary => 1, sep_char => "\0" }, - { binary => 1, sep_char => "\t" }, - { binary => 1, sep_char => "," }, - { binary => 1 }, - ); - - my $succesful_csv_type; - my $csv; - for my $csv_file_type (@csv_file_types) { - $csv = Text::CSV->new ( $csv_file_type ) - or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag (); - $succesful_csv_type = $csv_file_type; - my $last_n_fields; - for my $line (@lines) { - if($csv->parse($line)) { - my $n_fields = ($csv->fields()); - $last_fields ||= $n_fields; - - } else{ - $succesful_csv_type = 0; - last; - } - } - - } - if(not $succesful_csv_type) { - $csv->error_diag(); - } - - $csv = Text::CSV->new ( $succesful_csv_type ) # should set binary attribute. - or die "Cannot use CSV: ".Text::CSV->error_diag (); - return($csv); -}