transpose: Autofind separator.

This commit is contained in:
Ole Tange 2020-11-28 21:39:35 +01:00
parent 39f19757ae
commit 66be041e30
3 changed files with 301 additions and 185 deletions

View file

@@ -1,6 +1,6 @@
CMD = blink 2grep 2search burncpu drac duplicate-packets em emoticons \
encdir field find-first-fail forever fxkill G gitnext gitundo \
goodpasswd histogram Loffice mtrr mirrorpdf neno off parsort \
goodpasswd histogram Loffice mtrr mirrorpdf neno not off \
pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \
rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
swapout T teetime timestamp tracefile transpose upsidedown \
@@ -12,10 +12,9 @@ all: blink/blink.1 2search/2grep.1 2search/2search.1 \
find-first-fail/find-first-fail.1 G/G.1 gitnext/gitnext.1 \
gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
off/off.1 parsort/parsort.1 pdfman/pdfman.1 pidcmd/pidcmd.1 \
pidtree/pidtree.1 plotpipe/plotpipe.1 puniq/puniq.1 \
rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \
seekmaniac/seekmaniac.1 shython/shython.1 \
off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1 \
plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1 \
rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1 \
sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \
tracefile/tracefile.1 transpose/transpose.1 T/T.1 \

View file

@@ -1,7 +1,8 @@
#!/bin/bash
: <<=cut
: <<'_EOS'
=pod
=cut
=head1 NAME
@@ -9,30 +10,40 @@ transpose - transpose CSV file
=head1 SYNOPSIS
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input>]
... | B<transpose> [-d I<delim>] [-b I<blocksize>]
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input.csv>]
=head1 DESCRIPTION
B<transpose> will read a CSV fie
B<transpose> will read a CSV file and write the transposed version of
the file. I.e. rows will be columns, and columns will be rows.
=head1 OPTIONS
=over 9
=item I<input>
=item I<input.csv>
Input CSV file. If none is given reads from STDIN (standard input).
Input CSV file. If none is given B<transpose> reads from STDIN
(standard input).
=item B<-d> I<delim>
Use I<delim> as delimiter in input and output.
Use I<delim> as delimiter in input and output. If no delimiter is
given, B<transpose> will read the first 3 rows and try to guess the
delimiter.
The autodetection does not work well if values contain a quoted
delimiter: E.g. a,"value with quoted ,",other value
=item B<-b> I<blocksize>
Pass chunks of I<blocksize> bytes to the internal transposer. Memory
usage will be 10 times I<blocksiz> per CPU core. Default is 10M.
usage will be 10 times I<blocksize> per CPU core. Default is 100M.
=back
@@ -40,18 +51,60 @@ usage will be 10 times I<blocksiz> per CPU core. Default is 10M.
=head1 EXAMPLES
=head2 EXAMPLE: Transpose a medium sized TSV file
=head2 EXAMPLE: Transpose a TSV file
cat normal.tsv | transpose -d '\t' > transposed.tsv
=head2 EXAMPLE: transpose 57 GB 100000x100000 CSV file
# Generate 100000x100000 matrix
100kx100k() {
100000x() {
while seq 123456 | shuf; do true; done |
paste $(perl -e 'print map {"- "} 1..100000') |
head -n $1
}
export -f 100000x
seq 1000 | parallel --nice 18 --delay 0.05 --files 100000x 100 |
parallel -uj1 'cat {}; nice rm {} &'
}
100kx100k > 100kx100k
# Transpose it
transpose 100kx100k > 100kx100k.t
This takes around 700 MB/core and 20 minutes to run on 64C64T.
=head1 LIMITATIONS
B<transpose> is tested on a 1000000x1000000 3.6 TB table.
There is a limit on how many filehandles can be used in super_paste.
This is probably in the order of 1000000. This limits the number of
temporary files. By increasing the block size the number of temporary
files will be lowered. The 3.6 TB test resulted in 36000 files, so if
the limit is 1000000 files, it should work fine up to 100 TB before
you need to increase the block size.
cat medium.tsv | transpose -d '\t' > muidem.tsv
=head1 DESIGN
B<transpose> is designed to deal efficiently with medium sized data
(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It
works by chopping the input into 10 MB blocks. Each block is
(up to 30 TB files) on systems with 2 GB RAM per CPU core. It works by
chopping the input into blocks (default: 100 MB). Each block is
transposed in parallel and saved to disk. Then these files are pasted
together and finally removed.
B<transpose> uses B<csvtool> if installed and a (slower) perl script
otherwise.
=head1 BUGS
B<transpose> makes files in $TMPDIR (default: /tmp). These are not
cleaned up, if B<transpose> is stopped abnormally (e.g. killed).
=head1 REPORTING BUGS
Report bugs to <tange@gnu.org>.
@@ -59,7 +112,7 @@ Report bugs to <tange@gnu.org>.
=head1 AUTHOR
Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free
Copyright (C) 2013-2020 Ole Tange, http://ole.tange.dk and Free
Software Foundation, Inc.
@@ -178,22 +231,34 @@ B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
=head1 SEE ALSO
B<bash>(1), B<parallel>(1), B<paste>(1)
B<csvtool>(1), B<bash>(1), B<parallel>(1), B<paste>(1)
=cut
_EOS
#'
# Timings: 100kx200k (114GB) 34min
# 200kx200k (228GB) 63min
# transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv
# cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv
transpose_inner() {
# simple in-memory transpose
# -d sep
# Input:
transpose_perl() {
# Simple in-memory transpose
# Standard input:
# data to be transposed
# Output:
# Standard output:
# transposed data
perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
sep="$1"
shift
if [ "$sep" == '\s+' ] ; then
# Multiple spaces = separator
space_merger() { tr ' ' ' ' | tr -s ' '; }
sep=" "
else
space_merger() { cat; }
fi
space_merger | perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
#!/usr/bin/perl
use Text::CSV;
use Getopt::Long;
@@ -233,103 +298,225 @@ sub simple {
print map { join($sep,@$_),"\n" } @table;
}
cut-here-UbsAqi0j6GoOuk5W5yWA
) "$@"
) -d "$sep" "$@"
}
export -f transpose_inner
export -f transpose_perl
stdin_to_paste_files() {
# Run transpose_inner on blocks from stdin
# output each block as file name
local block_size
local sep
block_size="$1"
sep="$2"
PARALLEL="-k --files --block $block_size" \
parallel --pipe transpose_inner -d "'$sep'"
transpose_csvtool() {
# Use csvtool to transpose
# Standard input:
# data to be transposed
# Standard output:
# transposed data
sep="$1"
if [ "$sep" == "\s+" ] ; then
# Multiple spaces = separator
tr ' ' ' ' | tr -s ' ' |
csvtool transpose -t " " -u " " -
else
csvtool transpose -t "$sep" -u "$sep" -
fi
}
export -f transpose_csvtool
detect_transposer() {
# Find the fastest transpose tool installed
if bash -c 'echo 1 | transpose_csvtool ,' >/dev/null 2>&1; then
echo transpose_csvtool
else
if bash -c 'echo 1 | transpose_perl ,' >/dev/null 2>&1; then
echo transpose_perl
else
echo Error: You need the Perl Text::CSV module or csvtool. >&2
fi
fi
}
file_to_paste_files() {
# Run transpose_inner on blocks from $file
# output each block as file name
local block_size
local sep
make_paste_files() {
# Transpose input in blocks
# Output:
# each transposed block as file name
block_size="$1"
sep="$2"
file="$3"
PARALLEL="-k --files --block $block_size" \
parallel --pipe-part -a "$file" transpose_inner -d "'$sep'"
transposer=$(detect_transposer)
par_opt="-k --files --block $block_size"
if [ -z "$file" ]; then
parallel $par_opt --pipe "$transposer" "'$sep'"
else
parallel $par_opt --pipe-part -a "$file" "$transposer" "'$sep'"
fi
}
super_paste() {
# Like 'paste' up to 1000000 files
# More than 250000 files requires extra filehandles for GNU Parallel
# The files are read from stdin
local sep
local paste_files
local fifo
other_commands() {
printf "\rSIGINT caught "
ls -l $paste_files
cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
rm $paste_files
cleanup() {
printf "\rSIGINT caught \n" >&2
(rm -rf "$TMPDIR" &)&
exit 1
}
trap 'other_commands' SIGINT
trap 'cleanup' SIGINT
sep="$1"
paste_files=`tempfile`
paste_files=$(tempfile)
# basename
fifo=`tempfile`
rm $fifo
fifo=$(tempfile)
rm "$fifo"
# Group files from stdin in groups of 1000 files
parallel -k -n1000 echo > $paste_files
parallel -k -n1000 echo > "$paste_files"
# Define replacement string {0#} to 0-pad job number
export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
$_=sprintf("%0${f}d",seq())'\'
# Make fifos that can be read from
cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}"
cat "$paste_files" | parallel "rm -f '$fifo'{0#}; mkfifo '$fifo'{0#}"
# Start a paste process for every 1000 files
cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" &
cat "$paste_files" | parallel -j0 "eval paste -d \''$sep'\' {} > '$fifo'{0#}" &
# Paste all the fifos
eval paste -d "'$sep'" $fifo*
eval paste -d "'$sep'" "$fifo"*
# Cleanup
cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
rm $paste_files
cat "$paste_files" | parallel "eval rm -f {} '$fifo'{0#}"
rm "$paste_files"
}
stdin_detect_sep() {
# Read the first 3 lines and detect the separator
# Save the read input to file
local file
file="$1"
# TODO
echo "$d"
# Read the first 3 lines of stdin and detect the separator
# Only , space tab ; | : \0 and whitespace are detected
# Save the 3 lines input to file so it can be read again later
perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
#!/usr/bin/perl
sub max(@) {
# Returns:
# Maximum value of array
my $max;
for (@_) {
# Skip undefs
defined $_ or next;
defined $max or do { $max = $_; next; }; # Set $_ to the first non-undef
$max = ($max > $_) ? $max : $_;
}
return $max;
}
sub find_sep(@) {
# Try the common separators to find the one that fits.
# Do we get the same for each line?
my @csv = grep { not /^#/ } @_;
my @sep = (",", "\t", ";", ' ', '\|', ':', "\0", '\s+');
my $columns;
my %col;
for my $sep (@sep) {
for my $line (@csv) {
$columns = split /$sep/, $line;
if($columns > 1) {
$col{$sep."\0".$columns}++
}
}
}
# Find max $col{$sep,$columns}
my $most_lines = max(values %col);
my %sepcol = (map { split /\0/, $_ }
grep { $col{$_} == $most_lines } keys %col);
my $most_cols = max(values %sepcol);
return ((grep { $sepcol{$_} == $most_cols } keys %sepcol)[0]);
}
my $buf = "";
my $newlines = 0;
open(OUT, "+>", shift) || die;
# Copy (at least) 3 lines to OUT
while(sysread(STDIN,$buf,131072)) {
print OUT $buf;
$newlines += $buf =~ tr/\n/\n/;
if($newlines >= 3) {
last;
}
}
seek(OUT,0,0) || die;
my @lines = <OUT>;
close OUT;
# Remove last half-line
pop @lines;
print find_sep(@lines);
cut-here-UbsAqi0j6GoOuk5W5yWA
) "$@"
}
matrix() {
# Generate table X by Y
row="$1"
col="$2"
sep="$3"
mxn() { perl -E 'for(1..'$1') { say join "'$3'", map {int(rand()*123456)} 1..'$2' } '; }
export -f mxn
thr=$(parallel --number-of-threads)
prow=$((row/thr/100))
seq $((thr*100)) | parallel mxn $prow $col $sep
mxn $((row-prow*thr*100)) $col $sep
}
demo() {
# Generate table X by Y
row="$1"
col="$2"
sep="$3"
# Generate string "- - - - " for each col
paste_format_string=$(perl -e 'print map {"- "} 1..'$col)
# Generate dummy values
while seq 123456; do true; done |
# Use paste's format string to make $col columns
paste -d "$sep" $paste_format_string |
# Keep the top $row rows
head -n "$row"
}
usage() {
echo "Usage: $0 [-d delimiter] [-b blocksize]" 1>&2; exit 1;
}
version() {
cat <<EOF
transpose 20201130
Copyright (C) 2020 Ole Tange, http://ole.tange.dk
License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
GNU parallel comes with no warranty.
Web site: https://gitlab.com/ole.tange/tangetools/-/tree/master/transpose
EOF
}
main() {
block_size=10M
while getopts ":b:d:" o; do
case "${o}" in
block_size=100M
while getopts ":b:d:V" o; do
case "$o" in
d)
d="$(printf "${OPTARG}")"
if [ "'" = "${d}" ] ; then
# Convert \t to TAB using printf
d=$(printf "$OPTARG")
if [ "'" = "$d" ] ; then
echo "Delimiter cannot be '"
usage
exit
exit 0
fi
;;
b)
block_size="${OPTARG}"
block_size="$OPTARG"
;;
V)
version
exit 0
;;
*)
usage
@@ -337,22 +524,46 @@ main() {
esac
done
shift $((OPTIND-1))
if [ -z "${d}" ] ; then
d="$(printf "\t")"
fi
file="$@"
first_lines=`tempfile`
file="$1"
sep="$d"
# Put all tempfiles into a single dir
export TMPDIR=`mktemp -d`
first_lines=$(tempfile)
if [ -z "$file" ]; then
sep="$(stdin_detect_sep $first_lines)"
(cat $first_lines; rm $first_lines; cat) |
stdin_to_paste_files $block_size "$sep" | super_paste "$sep"
else
sep="$(stdin_detect_sep < "$file" $first_lines)"
rm $first_lines
file_to_paste_files $block_size "$sep" "$file" | super_paste "$sep"
if [ -z "$sep" ] ; then
sep=$(stdin_detect_sep $first_lines)
if [ -z "$sep" ] ; then
echo "transpose: Cannot autodetect separator. Use -d" >&2
exit 1
fi
fi
if [ "$sep" == '\s+' ] ; then
# Multiple spaces = separator
osep=" "
else
osep="$sep"
fi
# Prepend stdin with the lines read in stdin_detect_sep
(cat "$first_lines"; rm "$first_lines"; cat) |
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
else
if [ -z "$sep" ] ; then
sep=$(stdin_detect_sep < "$file" "$first_lines")
if [ -z "$sep" ] ; then
echo "transpose: Cannot autodetect separator. Use -d" >&2
exit 1
fi
fi
rm "$first_lines"
if [ "$sep" == '\s+' ] ; then
# Multiple spaces = separator
osep=" "
else
osep="$sep"
fi
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
fi
rmdir "$TMPDIR" 2>/dev/null
}
# Make sure the whole file is read before starting

View file

@@ -1,94 +0,0 @@
#!/usr/bin/perl
use Text::CSV;
use File::Temp qw(tempfile tempdir);
my $csv;
my (@table);
my $first_line = 1;
my $col = 0;
while(my $l = <>) {
if($first_line) {
my $csv_setting = guess_csv_setting($l);
$csv = Text::CSV->new($csv_setting)
or die "Cannot use CSV: ".Text::CSV->error_diag ();
$first_line = 0;
}
if(not $csv->parse($l)) {
die "CSV has unexpected format";
}
# append to each row
my $row = 0;
for($csv->fields()) {
$table[$row][$col] = defined($_) ? $_ : '';
$row++;
}
$col++;
}
print map { join("\t",@$_),"\n" } @table;
sub guess_csv_setting {
# Based on two lines guess the csv_setting
my $line = shift;
# Potential field separators
# Priority:
# \0 if both lines have the same number
# \t if both lines have the same number
my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/");
my %count;
@count{@fieldsep} = (0,0,0,0,0,0);
# Count characters
map { $count{$_}++ } split //,$line;
my @sepsort = sort { $count{$b} <=> $count{$a} } @fieldsep;
my $guessed_sep;
if($count{"\0"} > 0) {
# \0 is in the line => this is definitely the field sep
$guessed_sep = "\0";
} elsif($count{"\t"} > 0) {
# \t is in the line => this is definitely the field sep
$guessed_sep = "\t";
} else {
$guessed_sep = $sepsort[0];
}
return { binary => 1, sep_char => $guessed_sep };
}
sub _guess_csv_setting {
# Try different csv_settings
# Return a $csv object with the best setting
my @csv_file_types =
( { binary => 1, sep_char => "\0" },
{ binary => 1, sep_char => "\t" },
{ binary => 1, sep_char => "," },
{ binary => 1 },
);
my $succesful_csv_type;
my $csv;
for my $csv_file_type (@csv_file_types) {
$csv = Text::CSV->new ( $csv_file_type )
or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag ();
$succesful_csv_type = $csv_file_type;
my $last_n_fields;
for my $line (@lines) {
if($csv->parse($line)) {
my $n_fields = ($csv->fields());
$last_fields ||= $n_fields;
} else{
$succesful_csv_type = 0;
last;
}
}
}
if(not $succesful_csv_type) {
$csv->error_diag();
}
$csv = Text::CSV->new ( $succesful_csv_type ) # should set binary attribute.
or die "Cannot use CSV: ".Text::CSV->error_diag ();
return($csv);
}