transpose: Autofind separator.

This commit is contained in:
Ole Tange 2020-11-28 21:39:35 +01:00
parent 39f19757ae
commit 66be041e30
3 changed files with 301 additions and 185 deletions

View file

@ -1,6 +1,6 @@
CMD = blink 2grep 2search burncpu drac duplicate-packets em emoticons \ CMD = blink 2grep 2search burncpu drac duplicate-packets em emoticons \
encdir field find-first-fail forever fxkill G gitnext gitundo \ encdir field find-first-fail forever fxkill G gitnext gitundo \
goodpasswd histogram Loffice mtrr mirrorpdf neno off parsort \ goodpasswd histogram Loffice mtrr mirrorpdf neno not off \
pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \ pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \
rina rn rrm seekmaniac shython sound-reload splitvideo stdout \ rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
swapout T teetime timestamp tracefile transpose upsidedown \ swapout T teetime timestamp tracefile transpose upsidedown \
@ -12,10 +12,9 @@ all: blink/blink.1 2search/2grep.1 2search/2search.1 \
find-first-fail/find-first-fail.1 G/G.1 gitnext/gitnext.1 \ find-first-fail/find-first-fail.1 G/G.1 gitnext/gitnext.1 \
gitundo/gitundo.1 goodpasswd/goodpasswd.1 \ gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \ histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
off/off.1 parsort/parsort.1 pdfman/pdfman.1 pidcmd/pidcmd.1 \ off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1 \
pidtree/pidtree.1 plotpipe/plotpipe.1 puniq/puniq.1 \ plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1 \
rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \ rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1 \
seekmaniac/seekmaniac.1 shython/shython.1 \
sound-reload/sound-reload.1 splitvideo/splitvideo.1 \ sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \ stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \
tracefile/tracefile.1 transpose/transpose.1 T/T.1 \ tracefile/tracefile.1 transpose/transpose.1 T/T.1 \

View file

@ -1,7 +1,8 @@
#!/bin/bash #!/bin/bash
: <<=cut : <<'_EOS'
=pod =pod
=cut
=head1 NAME =head1 NAME
@ -9,30 +10,40 @@ transpose - transpose CSV file
=head1 SYNOPSIS =head1 SYNOPSIS
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input>] ... | B<transpose> [-d I<delim>] [-b I<blocksize>]
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input.csv>]
=head1 DESCRIPTION =head1 DESCRIPTION
B<transpose> will read a CSV fie B<transpose> will read a CSV file and write the transposed version of
the file. I.e. rows will be columns, and columns will be rows.
=head1 OPTIONS =head1 OPTIONS
=over 9 =over 9
=item I<input> =item I<input.csv>
Input CSV file. If none is given reads from STDIN (standard input). Input CSV file. If none is given B<transpose> reads from STDIN
(standard input).
=item B<-d> I<delim> =item B<-d> I<delim>
Use I<delim> as delimiter in input and output. Use I<delim> as delimiter in input and output. If no delimiter is
given, B<transpose> will read the first 3 rows and try to guess the
delimiter.
The autodetection does not work well if values contain a quoted
delimiter: E.g. a,"value with quoted ,",other value
=item B<-b> I<blocksize> =item B<-b> I<blocksize>
Pass chunks of I<blocksize> bytes to the internal transposer. Memory Pass chunks of I<blocksize> bytes to the internal transposer. Memory
usage will be 10 times I<blocksiz> per CPU core. Default is 10M. usage will be 10 times I<blocksize> per CPU core. Default is 100M.
=back =back
@ -40,18 +51,60 @@ usage will be 10 times I<blocksiz> per CPU core. Default is 10M.
=head1 EXAMPLES =head1 EXAMPLES
=head2 EXAMPLE: Transpose a medium sized TSV file =head2 EXAMPLE: Transpose a TSV file
cat normal.tsv | transpose -d '\t' > transposed.tsv
=head2 EXAMPLE: transpose 57 GB 100000x100000 CSV file
# Generate 100000x100000 matrix
100kx100k() {
100000x() {
while seq 123456 | shuf; do true; done |
paste $(perl -e 'print map {"- "} 1..100000') |
head -n $1
}
export -f 100000x
seq 1000 | parallel --nice 18 --delay 0.05 --files 100000x 100 |
parallel -uj1 'cat {}; nice rm {} &'
}
100kx100k > 100kx100k
# Transpose it
transpose 100kx100k > 100kx100k.t
This takes around 700 MB/core and 20 minutes to run on 64C64T.
=head1 LIMITATIONS
B<transpose> is tested on a 1000000x1000000 3.6 TB table.
There is a limit on how many filehandles can be used in super_paste.
This is probably in the order of 1000000. This limits the number of
temporary files. By increasing the block size the number of temporary
files will be lowered. The 3.6 TB test resulted in 36000 files, so if
the limit is 1000000 files, it should work fine up to 100 TB before
you need to increase the block size.
cat medium.tsv | transpose -d '\t' > muidem.tsv
=head1 DESIGN =head1 DESIGN
B<transpose> is designed to deal efficiently with medium sized data B<transpose> is designed to deal efficiently with medium sized data
(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It (up to 30 TB files) on systems with 2 GB RAM per CPU core. It works by
works by chopping the input into 10 MB blocks. Each block is chopping the input into blocks (default: 100 MB). Each block is
transposed in parallel and saved to disk. Then these files are pasted transposed in parallel and saved to disk. Then these files are pasted
together and finally removed. together and finally removed.
B<transpose> uses B<csvtool> if installed and a (slower) perl script
otherwise.
=head1 BUGS
B<transpose> makes files in $TMPDIR (default: /tmp). These are not
cleaned up, if B<transpose> is stopped abnormally (e.g. killed).
=head1 REPORTING BUGS =head1 REPORTING BUGS
Report bugs to <tange@gnu.org>. Report bugs to <tange@gnu.org>.
@ -59,7 +112,7 @@ Report bugs to <tange@gnu.org>.
=head1 AUTHOR =head1 AUTHOR
Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free Copyright (C) 2013-2020 Ole Tange, http://ole.tange.dk and Free
Software Foundation, Inc. Software Foundation, Inc.
@ -178,22 +231,34 @@ B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
=head1 SEE ALSO =head1 SEE ALSO
B<bash>(1), B<parallel>(1), B<paste>(1) B<csvtool>(1), B<bash>(1), B<parallel>(1), B<paste>(1)
=cut =cut
_EOS
#'
# Timings: 100kx200k (114GB) 34min
# 200kx200k (228GB) 63min
# transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv transpose_perl() {
# cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv # Simple in-memory transpose
# Standard input:
transpose_inner() {
# simple in-memory transpose
# -d sep
# Input:
# data to be transposed # data to be transposed
# Output: # Standard output:
# transposed data # transposed data
perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA' sep="$1"
shift
if [ "$sep" == '\s+' ] ; then
# Multiple spaces = separator
space_merger() { tr ' ' ' ' | tr -s ' '; }
sep=" "
else
space_merger() { cat; }
fi
space_merger | perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
#!/usr/bin/perl
use Text::CSV; use Text::CSV;
use Getopt::Long; use Getopt::Long;
@ -233,103 +298,225 @@ sub simple {
print map { join($sep,@$_),"\n" } @table; print map { join($sep,@$_),"\n" } @table;
} }
cut-here-UbsAqi0j6GoOuk5W5yWA cut-here-UbsAqi0j6GoOuk5W5yWA
) "$@" ) -d "$sep" "$@"
} }
export -f transpose_inner export -f transpose_perl
stdin_to_paste_files() { transpose_csvtool() {
# Run transpose_inner on blocks from stdin # Use cvstool to transpose
# output each block as file name # Standard input:
local block_size # data to be transposed
local sep # Standard output:
block_size="$1" # transposed data
sep="$2" sep="$1"
PARALLEL="-k --files --block $block_size" \ if [ "$sep" == "\s+" ] ; then
parallel --pipe transpose_inner -d "'$sep'" # Multiple spaces = separator
tr ' ' ' ' | tr -s ' ' |
csvtool transpose -t " " -u " " -
else
csvtool transpose -t "$sep" -u "$sep" -
fi
}
export -f transpose_csvtool
detect_transposer() {
    # Find the fastest transpose tool installed.
    #
    # Probes csvtool first (faster), then the Perl Text::CSV fallback,
    # by running each candidate on a trivial one-cell input.
    # Output:
    #   name of the usable transpose shell function on stdout
    # Returns:
    #   non-zero if neither tool works, so callers can detect failure
    #   instead of silently receiving an empty tool name and passing it
    #   to GNU Parallel as an empty command.
    if bash -c 'echo 1 | transpose_csvtool ,' >/dev/null 2>&1; then
	echo transpose_csvtool
    elif bash -c 'echo 1 | transpose_perl ,' >/dev/null 2>&1; then
	echo transpose_perl
    else
	echo Error: You need the Perl Text::CSV module or csvtool. >&2
	return 1
    fi
}
file_to_paste_files() { make_paste_files() {
# Run transpose_inner on blocks from $file # Transpose input in blocks
# output each block as file name # Output:
local block_size # each transposed block as file name
local sep
block_size="$1" block_size="$1"
sep="$2" sep="$2"
file="$3" file="$3"
PARALLEL="-k --files --block $block_size" \ transposer=$(detect_transposer)
parallel --pipe-part -a "$file" transpose_inner -d "'$sep'" par_opt="-k --files --block $block_size"
if [ -z "$file" ]; then
parallel $par_opt --pipe "$transposer" "'$sep'"
else
parallel $par_opt --pipe-part -a "$file" "$transposer" "'$sep'"
fi
} }
super_paste() { super_paste() {
# Like 'paste' up to 1000000 files # Like 'paste' up to 1000000 files
# More than 250000 files requires extra filehandles for GNU Parallel # More than 250000 files requires extra filehandles for GNU Parallel
# The files are read from stdin # The files are read from stdin
local sep cleanup() {
local paste_files printf "\rSIGINT caught \n" >&2
local fifo (rm -rf "$TMPDIR" &)&
exit 1
}
other_commands() { trap 'cleanup' SIGINT
printf "\rSIGINT caught "
ls -l $paste_files
cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
rm $paste_files
}
trap 'other_commands' SIGINT
sep="$1" sep="$1"
paste_files=`tempfile` paste_files=$(tempfile)
# basename # basename
fifo=`tempfile` fifo=$(tempfile)
rm $fifo rm "$fifo"
# Group files from stdin in groups of 1000 files # Group files from stdin in groups of 1000 files
parallel -k -n1000 echo > $paste_files parallel -k -n1000 echo > "$paste_files"
# Define replacement string {0#} to 0-pad job number # Define replacement string {0#} to 0-pad job number
export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10))); export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
$_=sprintf("%0${f}d",seq())'\' $_=sprintf("%0${f}d",seq())'\'
# Make fifos that can be read from # Make fifos that can be read from
cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}" cat "$paste_files" | parallel "rm -f '$fifo'{0#}; mkfifo '$fifo'{0#}"
# Start a paste process for every 1000 files # Start a paste process for every 1000 files
cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" & cat "$paste_files" | parallel -j0 "eval paste -d \''$sep'\' {} > '$fifo'{0#}" &
# Paste all the fifos # Paste all the fifos
eval paste -d "'$sep'" $fifo* eval paste -d "'$sep'" "$fifo"*
# Cleanup # Cleanup
cat $paste_files | parallel "eval rm -f {} $fifo{0#}" cat "$paste_files" | parallel "eval rm -f {} '$fifo'{0#}"
rm $paste_files rm "$paste_files"
} }
stdin_detect_sep() { stdin_detect_sep() {
# Read the first 3 lines and detect the separator # Read the first 3 lines of stdin and detect the separator
# Save the read input to file # Only , space tab ; | : \0 and whitespace are detected
local file # Save the 3 lines input to file so it can be read again later
file="$1" perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
# TODO #!/usr/bin/perl
echo "$d"
sub max(@) {
    # Returns:
    #   the largest defined value in the argument list,
    #   or undef if every argument is undef (or the list is empty)
    my $biggest;
    for my $val (@_) {
        # undefs are skipped entirely
        next if not defined $val;
        if (not defined $biggest or $val > $biggest) {
            $biggest = $val;
        }
    }
    return $biggest;
}
sub find_sep(@) {
# Guess the field separator used by the given lines.
# Candidate separators: , TAB ; space | : NUL and runs of whitespace.
# Strategy: for each candidate, count how many lines split into the
# same (>1) number of columns; the candidate consistent across the
# most lines wins, ties broken by preferring more columns.
# Args:
#   list of sample lines (lines starting with # are ignored)
# Returns:
#   the winning separator as a regex string, or undef if no candidate
#   splits any line into more than one column
my @csv = grep { not /^#/ } @_;
my @sep = (",", "\t", ";", ' ', '\|', ':', "\0", '\s+');
my $columns;
my %col;
for my $sep (@sep) {
for my $line (@csv) {
# split in scalar context yields the number of fields
$columns = split /$sep/, $line;
if($columns > 1) {
# Tally under the packed key "sep\0columncount"
$col{$sep."\0".$columns}++
}
}
}
# Find max $col{$sep,$columns}
# First keep only (sep,columns) pairs seen on the most lines ...
my $most_lines = max(values %col);
my %sepcol = (map { split /\0/, $_ }
grep { $col{$_} == $most_lines } keys %col);
# ... then among those, prefer the pair with the most columns
my $most_cols = max(values %sepcol);
return ((grep { $sepcol{$_} == $most_cols } keys %sepcol)[0]);
}
# Driver: read stdin until at least 3 complete lines have been seen,
# saving every byte read to the file named by the first argument so
# the shell caller can replay the consumed input later. Then print
# the separator guessed from those sample lines.
my $buf = "";
my $newlines = 0;
open(OUT, "+>", shift) || die;
# Copy (at least) 3 lines to OUT
while(sysread(STDIN,$buf,131072)) {
print OUT $buf;
# tr/// in this form counts newlines in the chunk without changing it
$newlines += $buf =~ tr/\n/\n/;
if($newlines >= 3) {
last;
}
}
# Rewind and re-read what was written, to analyze the sample lines
seek(OUT,0,0) || die;
my @lines = <OUT>;
close OUT;
# Remove last half-line
# NOTE(review): the last line is dropped unconditionally, even when
# the final sysread happened to end exactly on a newline. Harmless
# for detection (>=2 full lines remain), but worth confirming.
pop @lines;
# Prints nothing if find_sep returns undef (caller treats empty as failure)
print find_sep(@lines);
cut-here-UbsAqi0j6GoOuk5W5yWA
) "$@"
}
matrix() {
# Generate a $row x $col table of pseudo-random integers joined by $sep,
# parallelized over all CPU threads.
# Args: row col sep
row="$1"
col="$2"
sep="$3"
# mxn ROWS COLS SEP: print ROWS lines of COLS random ints joined by SEP.
# NOTE(review): $1/$2/$3 are spliced into the perl source text, so the
# arguments must not contain single quotes or perl syntax - confirm
# callers only pass numbers and a plain separator.
mxn() { perl -E 'for(1..'$1') { say join "'$3'", map {int(rand()*123456)} 1..'$2' } '; }
export -f mxn
# Split the bulk of the work into thr*100 jobs of prow rows each
thr=$(parallel --number-of-threads)
prow=$((row/thr/100))
seq $((thr*100)) | parallel mxn $prow $col $sep
# Emit the remainder so the total is exactly $row rows
mxn $((row-prow*thr*100)) $col $sep
}
demo() {
# Generate table X by Y
row="$1"
col="$2"
sep="$3"
# Generate string "- - - - " for each col
paste_format_string=$(perl -e 'print map {"- "} 1..'$col)
# Generate dummy values
while seq 123456; do true; done |
# Use paste's format string to make $col columns
paste -d "$sep" $paste_format_string |
# Keep the top $row rows
head -n "$row"
} }
usage() { usage() {
echo "Usage: $0 [-d delimiter] [-b blocksize]" 1>&2; exit 1; echo "Usage: $0 [-d delimiter] [-b blocksize]" 1>&2; exit 1;
} }
version() {
# Print version, copyright, and license information to stdout.
# The heredoc content is runtime output and is emitted verbatim.
# NOTE(review): the warranty line reads "GNU parallel comes with no
# warranty" - presumably copied from GNU parallel's version text;
# should likely say "transpose". Left unchanged here since it is
# user-visible output.
cat <<EOF
transpose 20201130
Copyright (C) 2020 Ole Tange, http://ole.tange.dk
License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
GNU parallel comes with no warranty.
Web site: https://gitlab.com/ole.tange/tangetools/-/tree/master/transpose
EOF
}
main() { main() {
block_size=10M block_size=100M
while getopts ":b:d:" o; do while getopts ":b:d:V" o; do
case "${o}" in case "$o" in
d) d)
d="$(printf "${OPTARG}")" # Convert \t to TAB using printf
if [ "'" = "${d}" ] ; then d=$(printf "$OPTARG")
if [ "'" = "$d" ] ; then
echo "Delimiter cannot be '" echo "Delimiter cannot be '"
usage usage
exit exit 0
fi fi
;; ;;
b) b)
block_size="${OPTARG}" block_size="$OPTARG"
;;
V)
version
exit 0
;; ;;
*) *)
usage usage
@ -337,22 +524,46 @@ main() {
esac esac
done done
shift $((OPTIND-1)) shift $((OPTIND-1))
file="$1"
if [ -z "${d}" ] ; then sep="$d"
d="$(printf "\t")" # Put all tempfiles into a single dir
fi export TMPDIR=`mktemp -d`
first_lines=$(tempfile)
file="$@"
first_lines=`tempfile`
if [ -z "$file" ]; then if [ -z "$file" ]; then
sep="$(stdin_detect_sep $first_lines)" if [ -z "$sep" ] ; then
(cat $first_lines; rm $first_lines; cat) | sep=$(stdin_detect_sep $first_lines)
stdin_to_paste_files $block_size "$sep" | super_paste "$sep" if [ -z "$sep" ] ; then
else echo "transpose: Cannot autodetect separator. Use -d" >&2
sep="$(stdin_detect_sep < "$file" $first_lines)" exit 1
rm $first_lines
file_to_paste_files $block_size "$sep" "$file" | super_paste "$sep"
fi fi
fi
if [ "$sep" == '\s+' ] ; then
# Multiple spaces = separator
osep=" "
else
osep="$sep"
fi
# Prepend stdin with the lines read in stdin_detect_sep
(cat "$first_lines"; rm "$first_lines"; cat) |
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
else
if [ -z "$sep" ] ; then
sep=$(stdin_detect_sep < "$file" "$first_lines")
if [ -z "$sep" ] ; then
echo "transpose: Cannot autodetect separator. Use -d" >&2
exit 1
fi
fi
rm "$first_lines"
if [ "$sep" == '\s+' ] ; then
# Multiple spaces = separator
osep=" "
else
osep="$sep"
fi
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
fi
rmdir "$TMPDIR" 2>/dev/null
} }
# Make sure the whole file is read before starting # Make sure the whole file is read before starting

View file

@ -1,94 +0,0 @@
#!/usr/bin/perl
use Text::CSV;
use File::Temp qw(tempfile tempdir);
# In-memory transpose: input line N becomes output column N.
my $csv;
my (@table);         # $table[$row][$col] holds the transposed cell
my $first_line = 1;  # true until CSV settings have been guessed
my $col = 0;         # output column index = input line number
while(my $l = <>) {
if($first_line) {
# Guess the separator from the first line only
my $csv_setting = guess_csv_setting($l);
$csv = Text::CSV->new($csv_setting)
or die "Cannot use CSV: ".Text::CSV->error_diag ();
$first_line = 0;
}
if(not $csv->parse($l)) {
die "CSV has unexpected format";
}
# append to each row
my $row = 0;
for($csv->fields()) {
# Map undef fields to empty string so join below never warns
$table[$row][$col] = defined($_) ? $_ : '';
$row++;
}
$col++;
}
# NOTE(review): output is always TAB-joined, regardless of the
# separator detected in the input - confirm this is intended.
print map { join("\t",@$_),"\n" } @table;
sub guess_csv_setting {
# Guess the Text::CSV settings from a single input line.
# Args:
#   $line : the first line of the CSV input
# Returns:
#   hashref suitable for Text::CSV->new()
# Priority:
#   \0 if present => definitely the field separator
#   \t if present => definitely the field separator
#   otherwise the most frequent candidate in the line
my $line = shift;
# Potential field separators
my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/");
my %count;
# Initialize every candidate to 0. (The original assigned only six
# zeros for seven separators, leaving "/" undef and causing undef
# warnings / unstable ordering in the numeric sort below.)
@count{@fieldsep} = (0) x @fieldsep;
# Count every character in the line (for-loop, not void-context map)
$count{$_}++ for split //, $line;
my @sepsort = sort { $count{$b} <=> $count{$a} } @fieldsep;
my $guessed_sep;
if($count{"\0"} > 0) {
# \0 is in the line => this is definitely the field sep
$guessed_sep = "\0";
} elsif($count{"\t"} > 0) {
# \t is in the line => this is definitely the field sep
$guessed_sep = "\t";
} else {
# Fall back to the candidate occurring most often
$guessed_sep = $sepsort[0];
}
return { binary => 1, sep_char => $guessed_sep };
}
sub _guess_csv_setting {
# Try different csv_settings
# Return a $csv object with the best setting
#
# NOTE(review): dead code - nothing in this file calls it, and it
# cannot work as written:
#   - it iterates @lines, which is never declared or populated here
#   - $last_fields vs $last_n_fields: the names never match, so the
#     field-count consistency check is a no-op
#   - on parse failure $succesful_csv_type is set to 0 and later
#     passed to Text::CSV->new() anyway
# Keep only as reference, or delete.
my @csv_file_types =
( { binary => 1, sep_char => "\0" },
{ binary => 1, sep_char => "\t" },
{ binary => 1, sep_char => "," },
{ binary => 1 },
);
my $succesful_csv_type;
my $csv;
for my $csv_file_type (@csv_file_types) {
$csv = Text::CSV->new ( $csv_file_type )
or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag ();
$succesful_csv_type = $csv_file_type;
my $last_n_fields;
for my $line (@lines) {
if($csv->parse($line)) {
# NOTE(review): parens do not force list context here; the
# intended "number of fields" count is never actually taken
my $n_fields = ($csv->fields());
$last_fields ||= $n_fields;
} else{
$succesful_csv_type = 0;
last;
}
}
}
if(not $succesful_csv_type) {
$csv->error_diag();
}
$csv = Text::CSV->new ( $succesful_csv_type ) # should set binary attribute.
or die "Cannot use CSV: ".Text::CSV->error_diag ();
return($csv);
}