tangetools/transpose/transpose
Ole Tange f6a34e1200 transpose: parallelized.
w4it-for-port-open: -q (quiet) implemented.
2018-03-27 02:56:54 +02:00

161 lines
3.6 KiB
Bash
Executable file

#!/bin/bash
# transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv
# cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv
transpose_inner() {
    # Simple in-memory transpose of one block of delimited text.
    # Usage:
    #   transpose_inner -d sep [file ...]
    # Input (stdin, or the named files):
    #   data to be transposed
    # Output (stdout):
    #   transposed data; every output row has exactly one field per
    #   input line, so the outputs of several blocks can be joined
    #   side by side with paste without columns shifting.
    # The Perl program is fed to perl through process substitution so
    # that stdin stays free for the data.
    perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
use strict;
use warnings;
# The $opt::* package variables are option sinks; several are set by
# GetOptions and never read, which is intentional.
no warnings 'once';
use Text::CSV;
use Getopt::Long;

Getopt::Long::Configure("bundling","require_order");
# Only --delimiter is acted on; the other options are accepted and ignored.
my $retval = GetOptions("debug|D=s" => \$opt::debug,
                        "delimiter|d=s" => \$opt::delimiter,
                        "verbose|v" => \@opt::verbose,
                        "simple|s" => \$opt::simple,
                       );

if(defined $opt::delimiter) {
    simple();
} else {
    die("-d must be set");
}

# Read all input lines, store line N as column N of @table and print
# the table row by row.
sub simple {
    my @table;
    # Number of input lines read so far == index of the column being filled
    my $col = 0;
    my $csv_setting = { binary => 1, sep_char => $opt::delimiter };
    my $sep = $csv_setting->{sep_char};
    my $csv = Text::CSV->new($csv_setting)
        or die "Cannot use CSV: ".Text::CSV->error_diag ();

    while(my $line = <>) {
        if(not $csv->parse($line)) {
            die "CSV has unexpected format";
        }
        # Field number $row of this line goes into output row $row
        my $row = 0;
        for my $field ($csv->fields()) {
            $table[$row][$col] = defined($field) ? $field : '';
            $row++;
        }
        $col++;
    }
    for my $cells (@table) {
        # Input lines with fewer fields than others leave holes or a
        # short tail in the row. Pad the row to $col fields, otherwise
        # paste would put the next block's fields in the wrong column.
        print join($sep,
                   map { defined($_) ? $_ : '' } @{$cells}[0 .. $col - 1]),
              "\n";
    }
}
cut-here-UbsAqi0j6GoOuk5W5yWA
    ) "$@"
}
# Exported so the shells started by GNU parallel can call it
export -f transpose_inner
stdin_to_paste_files() {
    # Transpose stdin block by block with transpose_inner.
    #   $1 = block size (given to parallel --block)
    #   $2 = separator (given to transpose_inner -d)
    # Input (stdin):
    #   the table to transpose
    # Output (stdout):
    #   the name of one file per block (parallel --files)
    local block_size="$1"
    local sep="$2"
    # The separator is wrapped in '...' because parallel hands the
    # command to a shell; this is why the separator cannot be '
    PARALLEL="-k --files --block $block_size" \
        parallel --pipe transpose_inner -d "'$sep'"
}
file_to_paste_files() {
    # Run transpose_inner on blocks from a file.
    #   $1 = block size (given to parallel --block)
    #   $2 = separator (given to transpose_inner -d)
    #   $3 = file to read the blocks from (parallel --pipe-part -a)
    # Output (stdout):
    #   the name of one file per block (parallel --files)
    local block_size
    local sep
    # Kept local like the other parameters so the caller's $file is not touched
    local file
    block_size="$1"
    sep="$2"
    file="$3"
    # The separator is wrapped in '...' because parallel hands the
    # command to a shell; this is why the separator cannot be '
    PARALLEL="-k --files --block $block_size" \
        parallel --pipe-part -a "$file" transpose_inner -d "'$sep'"
}
super_paste() {
    # Like 'paste' up to 1000000 files
    #   $1 = separator to put between the pasted columns
    # Input (stdin):
    #   the names of the files to paste, one per line
    # Output (stdout):
    #   the pasted lines
    # The named files are removed afterwards.
    #
    # The files are pasted in groups of 1000 into one fifo per group,
    # and the fifos are then pasted together.
    local sep
    local paste_files
    local fifo
    # Local so the setting does not outlive this function
    local PARALLEL
    sep="$1"
    paste_files="$(mktemp "${TMPDIR:-/tmp}/transpose.XXXXXX")" || return 1
    # Only the unique name is needed: it is used as prefix for the fifos
    fifo="$(mktemp "${TMPDIR:-/tmp}/transpose.XXXXXX")" || {
        rm -f "$paste_files"
        return 1
    }
    rm "$fifo"
    cat > "$paste_files"
    if [ ! -s "$paste_files" ] ; then
        # No files to paste: no fifo would be made and the glob below
        # would match nothing
        rm -f "$paste_files"
        return 0
    fi
    # Define replacement string {0#} to 0-pad job number
    # (0-padded so the $fifo* glob sorts the fifos in job order)
    PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
      $_=sprintf("%0${f}d",seq())'\'
    # parallel only sees $PARALLEL if it is in the environment
    export PARALLEL
    # Make fifos that can be read from
    cat "$paste_files" | parallel -n1000 "rm -f $fifo{0#}; mkfifo $fifo{0#}"
    # Start a paste process for every 1000 files
    cat "$paste_files" | parallel -n1000 -j0 "paste -d '$sep' {} > $fifo{0#}" &
    # Paste all the fifos
    paste -d "$sep" "$fifo"*
    # Cleanup
    cat "$paste_files" | parallel -n1000 "rm -f {} $fifo{0#}"
    rm "$paste_files"
}
stdin_detect_sep() {
    # Placeholder for separator detection.
    #   $1 = file meant to receive the input read during detection
    # Intended (still TODO): read the first 3 lines of stdin, detect
    # the separator and save the lines read to $1.
    # Currently: stdin is not read, $1 is not written, and the global
    # $d is printed as the separator.
    local file="$1"
    # TODO
    echo "$d"
}
usage() {
    # Print the usage line on stderr and exit with status 1
    {
        echo "Usage: $0 [-d delimiter] [-b blocksize]"
    } >&2
    exit 1
}
# Main: parse the options, then transpose the named file or stdin.
# Default block size given to parallel --block
block_size=10M
while getopts ":b:d:" o; do
    case "${o}" in
        d)
            # %b expands backslash escapes (so -d '\t' gives a tab)
            # without using the delimiter as a printf format string:
            # as a format string a delimiter such as % printed nothing
            # and silently fell back to tab.
            d="$(printf '%b' "${OPTARG}")"
            if [ "'" = "${d}" ] ; then
                # stdout carries the transposed data: errors go to stderr
                echo "Delimiter cannot be '" >&2
                # usage exits
                usage
            fi
            ;;
        b)
            block_size="${OPTARG}"
            ;;
        *)
            usage
            ;;
    esac
done
shift $((OPTIND-1))
if [ -z "${d}" ] ; then
    # Default delimiter: tab
    d="$(printf "\t")"
fi

# Sep cannot be '
# At most one input file is supported
if [ $# -gt 1 ] ; then
    usage
fi
file="$1"
first_lines="$(mktemp "${TMPDIR:-/tmp}/transpose.XXXXXX")" || exit 1
if [ -z "$file" ]; then
    # No file given: transpose stdin
    sep="$(stdin_detect_sep "$first_lines")"
    # Replay whatever the detection saved in $first_lines before the
    # rest of stdin
    (cat "$first_lines"; rm "$first_lines"; cat) |
        stdin_to_paste_files "$block_size" "$sep" | super_paste "$sep"
else
    sep="$(stdin_detect_sep < "$file" "$first_lines")"
    # The file is read directly by parallel, so the saved lines are
    # not needed
    rm "$first_lines"
    file_to_paste_files "$block_size" "$sep" "$file" | super_paste "$sep"
fi