From ff313b457fea32e2c0a09b5fc6169a490cc7bc3a Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Tue, 9 Apr 2013 11:53:24 +0200 Subject: [PATCH] transpose: transpose csv --- transpose/transpose | 88 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100755 transpose/transpose diff --git a/transpose/transpose b/transpose/transpose new file mode 100755 index 0000000..6ddf6e5 --- /dev/null +++ b/transpose/transpose @@ -0,0 +1,88 @@ +#!/usr/bin/perl -w + +use English; +use FileCache; +use File::Temp; + +my $delimiter = shift; +my $buffer = shift; + +# Use at most 1000M before flushing +$buffer ||= 1000_000_000; +# Perl makes the buffer baloon to 10 times the requested value +$buffer /= 10; +$max_col_size = $buffer; +my $delimiter_regexp = $delimiter; +$delimiter_regexp =~ s/(\W)/\\$1/g; +my @current; +my $last_t = 0; +my $lineno = 0; +my %col; +while(<>) { + chomp; + # Split current line into columns + @current = split /$delimiter_regexp/o, $_; + my $t = 0; + map { + push(@{$col{$t}},$_); + $col_size{$t} += length $_; + if($col_size{$t} > $max_col_size) { + flush(\%col,$t); + $col_size{$t} = 0; + } + $t++; + } @current; + if($t != $last_t) { + if(0 == $last_t) { + $last_t = $t; + $max_col_size = $buffer/$last_t; + } else { + warning("Number of columns in line $NR: $t != $last_t\n"); + } + } +} +flush(\%col); +merge(); + +sub flush { + my $col_ref = shift; + my @cols_to_flush = @_; + if(not @cols_to_flush) { + @cols_to_flush = keys %$col_ref; + } + for my $c (@cols_to_flush) { + $Global::tempfile{$c} ||= tmpnam(); + my $fh = cacheout $Global::tempfile{$c}; + # This will print one delimiter too much, which we will deal with later + print $fh map { $_,$delimiter } @{$col_ref->{$c}}; + delete $col_ref->{$c}; + } +} + +sub merge { + for my $c (sort keys %Global::tempfile) { + my $fh = cacheout $Global::tempfile{$c}; + # truncate by length of delimiter to get rid of the last $delimiter + seek $fh,-length($delimiter),SEEK_END; + truncate $fh, tell $fh; + # Make sure the file is closed of writing + close $fh; + open($fh, "<", $Global::tempfile{$c}) || die; + my $buf; + while(sysread($fh,$buf,1000_000)) { + print $buf; + } + print "\n"; + unlink $Global::tempfile{$c}; + } +} + +sub warning { + my @w = @_; + print STDERR "transpose: Warning: ", @w; +} + +sub error { + my @w = @_; + print STDERR "transpose: Error: ", @w; +}