#!/usr/bin/perl -w use English; use FileCache; use File::Temp; my $delimiter = shift; my $buffer = shift; # Use at most 1000M before flushing $buffer ||= 1000_000_000; # Perl makes the buffer baloon to 10 times the requested value $buffer /= 10; $max_col_size = $buffer; my $delimiter_regexp = $delimiter; $delimiter_regexp =~ s/(\W)/\\$1/g; my @current; my $last_t = 0; my $lineno = 0; my %col; while(<>) { chomp; # Split current line into columns @current = split /$delimiter_regexp/o, $_; my $t = 0; map { push(@{$col{$t}},$_); $col_size{$t} += length $_; if($col_size{$t} > $max_col_size) { flush(\%col,$t); $col_size{$t} = 0; } $t++; } @current; if($t != $last_t) { if(0 == $last_t) { $last_t = $t; $max_col_size = $buffer/$last_t; } else { warning("Number of columns in line $NR: $t != $last_t\n"); } } } flush(\%col); merge(); sub flush { my $col_ref = shift; my @cols_to_flush = @_; if(not @cols_to_flush) { @cols_to_flush = keys %$col_ref; } for my $c (@cols_to_flush) { $Global::tempfile{$c} ||= tmpnam(); my $fh = cacheout $Global::tempfile{$c}; # This will print one delimiter too much, which we will deal with later print $fh map { $_,$delimiter } @{$col_ref->{$c}}; delete $col_ref->{$c}; } } sub merge { for my $c (sort keys %Global::tempfile) { my $fh = cacheout $Global::tempfile{$c}; # truncate by length of delimiter to get rid of the last $delimiter seek $fh,-length($delimiter),SEEK_END; truncate $fh, tell $fh; # Make sure the file is closed of writing close $fh; open($fh, "<", $Global::tempfile{$c}) || die; my $buf; while(sysread($fh,$buf,1000_000)) { print $buf; } print "\n"; unlink $Global::tempfile{$c}; } } sub warning { my @w = @_; print STDERR "transpose: Warning: ", @w; } sub error { my @w = @_; print STDERR "transpose: Error: ", @w; }