transpose: Autofind separator.
This commit is contained in:
parent
39f19757ae
commit
66be041e30
9
Makefile
9
Makefile
|
@ -1,6 +1,6 @@
|
|||
CMD = blink 2grep 2search burncpu drac duplicate-packets em emoticons \
|
||||
encdir field find-first-fail forever fxkill G gitnext gitundo \
|
||||
goodpasswd histogram Loffice mtrr mirrorpdf neno off parsort \
|
||||
goodpasswd histogram Loffice mtrr mirrorpdf neno not off \
|
||||
pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \
|
||||
rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
|
||||
swapout T teetime timestamp tracefile transpose upsidedown \
|
||||
|
@ -12,10 +12,9 @@ all: blink/blink.1 2search/2grep.1 2search/2search.1 \
|
|||
find-first-fail/find-first-fail.1 G/G.1 gitnext/gitnext.1 \
|
||||
gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
|
||||
histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
|
||||
off/off.1 parsort/parsort.1 pdfman/pdfman.1 pidcmd/pidcmd.1 \
|
||||
pidtree/pidtree.1 plotpipe/plotpipe.1 puniq/puniq.1 \
|
||||
rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \
|
||||
seekmaniac/seekmaniac.1 shython/shython.1 \
|
||||
off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1 \
|
||||
plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1 \
|
||||
rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1 \
|
||||
sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
|
||||
stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \
|
||||
tracefile/tracefile.1 transpose/transpose.1 T/T.1 \
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
#!/bin/bash
|
||||
|
||||
: <<=cut
|
||||
: <<'_EOS'
|
||||
=pod
|
||||
=cut
|
||||
|
||||
=head1 NAME
|
||||
|
||||
|
@ -9,30 +10,40 @@ transpose - transpose CSV file
|
|||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input>]
|
||||
... | B<transpose> [-d I<delim>] [-b I<blocksize>]
|
||||
|
||||
B<transpose> [-d I<delim>] [-b I<blocksize>] [I<input.csv>]
|
||||
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
B<transpose> will read a CSV fie
|
||||
B<transpose> will read a CSV file and write the transposed version of
|
||||
the file. I.e. rows will be columns, and columns will be rows.
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
=over 9
|
||||
|
||||
=item I<input>
|
||||
=item I<input.csv>
|
||||
|
||||
Input CSV file. If none is given reads from STDIN (standard input).
|
||||
Input CSV file. If none is given B<transpose> reads from STDIN
|
||||
(standard input).
|
||||
|
||||
|
||||
=item B<-d> I<delim>
|
||||
|
||||
Use I<delim> as delimiter in input and output.
|
||||
Use I<delim> as delimiter in input and output. If no delimiter is
|
||||
given, B<transpose> will read the first 3 rows and try to guess the
|
||||
delimiter.
|
||||
|
||||
The autodetection does not work well if values contain a quoted
|
||||
delimiter: E.g. a,"value with quoted ,",other value
|
||||
|
||||
|
||||
=item B<-b> I<blocksize>
|
||||
|
||||
Pass chunks of I<blocksize> bytes to the internal transposer. Memory
|
||||
usage will be 10 times I<blocksiz> per CPU core. Default is 10M.
|
||||
usage will be 10 times I<blocksize> per CPU core. Default is 100M.
|
||||
|
||||
|
||||
=back
|
||||
|
@ -40,18 +51,60 @@ usage will be 10 times I<blocksiz> per CPU core. Default is 10M.
|
|||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 EXAMPLE: Transpose a medium sized TSV file
|
||||
=head2 EXAMPLE: Transpose a TSV file
|
||||
|
||||
cat normal.tsv | transpose -d '\t' > transposed.tsv
|
||||
|
||||
=head2 EXAMPLE: transpose 57 GB 100000x100000 CSV file
|
||||
|
||||
# Generate 100000x100000 matrix
|
||||
100kx100k() {
|
||||
100000x() {
|
||||
while seq 123456 | shuf; do true; done |
|
||||
paste $(perl -e 'print map {"- "} 1..100000') |
|
||||
head -n $1
|
||||
}
|
||||
export -f 100000x
|
||||
seq 1000 | parallel --nice 18 --delay 0.05 --files 100000x 100 |
|
||||
parallel -uj1 'cat {}; nice rm {} &'
|
||||
}
|
||||
100kx100k > 100kx100k
|
||||
# Transpose it
|
||||
transpose 100kx100k > 100kx100k.t
|
||||
|
||||
This takes around 700 MB/core and 20 minutes to run on 64C64T.
|
||||
|
||||
|
||||
=head1 LIMITATIONS
|
||||
|
||||
B<transpose> is tested on a 1000000x1000000 3.6 TB table.
|
||||
|
||||
There is a limit on how many filehandles can be used in super_paste.
|
||||
This is probably in the order of 1000000. This limits the number of
|
||||
temporary files. By increasing the block size the number of temporary
|
||||
files will be lowered. The 3.6 TB test resulted in 36000 files, so if
|
||||
the limit is 1000000 files, it should work fine up to 100 TB before
|
||||
you need to increase the block size.
|
||||
|
||||
cat medium.tsv | transpose -d '\t' > muidem.tsv
|
||||
|
||||
=head1 DESIGN
|
||||
|
||||
B<transpose> is designed to deal efficiently with medium sized data
|
||||
(up to 30 TB per file) on systems with 100 MB RAM per CPU core. It
|
||||
works by chopping the input into 10 MB blocks. Each block is
|
||||
(up to 30 TB files) on systems with 2 GB RAM per CPU core. It works by
|
||||
chopping the input into blocks (default: 100 MB). Each block is
|
||||
transposed in parallel and saved to disk. Then these files are pasted
|
||||
together and finally removed.
|
||||
|
||||
B<transpose> uses B<csvtool> if installed and a (slower) perl script
|
||||
otherwise.
|
||||
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
B<transpose> makes files in $TMPDIR (default: /tmp). These are not
|
||||
cleaned up, if B<transpose> is stopped abnormally (e.g. killed).
|
||||
|
||||
|
||||
=head1 REPORTING BUGS
|
||||
|
||||
Report bugs to <tange@gnu.org>.
|
||||
|
@ -59,7 +112,7 @@ Report bugs to <tange@gnu.org>.
|
|||
|
||||
=head1 AUTHOR
|
||||
|
||||
Copyright (C) 2013-2018 Ole Tange, http://ole.tange.dk and Free
|
||||
Copyright (C) 2013-2020 Ole Tange, http://ole.tange.dk and Free
|
||||
Software Foundation, Inc.
|
||||
|
||||
|
||||
|
@ -178,22 +231,34 @@ B<transpose> uses Perl, B<paste>, B<bash> and B<parallel>.
|
|||
|
||||
=head1 SEE ALSO
|
||||
|
||||
B<bash>(1), B<parallel>(1), B<paste>(1)
|
||||
B<csvtool>(1), B<bash>(1), B<parallel>(1), B<paste>(1)
|
||||
|
||||
=cut
|
||||
_EOS
|
||||
#'
|
||||
|
||||
# Timings: 100kx200k (114GB) 34min
|
||||
# 200kx200k (228GB) 63min
|
||||
|
||||
# transpose [-d delimiter] [-b blocksize] table.csv > transposed.csv
|
||||
# cat table.csv | transpose [-d delimiter] [-b blocksize] > transposed.csv
|
||||
|
||||
transpose_inner() {
|
||||
# simple in-memory transpose
|
||||
# -d sep
|
||||
# Input:
|
||||
transpose_perl() {
|
||||
# Simple in-memory transpose
|
||||
# Standard input:
|
||||
# data to be transposed
|
||||
# Output:
|
||||
# Standard output:
|
||||
# transposed data
|
||||
perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
|
||||
sep="$1"
|
||||
shift
|
||||
|
||||
if [ "$sep" == '\s+' ] ; then
|
||||
# Multiple spaces = separator
|
||||
space_merger() { tr ' ' ' ' | tr -s ' '; }
|
||||
sep=" "
|
||||
else
|
||||
space_merger() { cat; }
|
||||
fi
|
||||
space_merger | perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
|
||||
#!/usr/bin/perl
|
||||
|
||||
use Text::CSV;
|
||||
use Getopt::Long;
|
||||
|
||||
|
@ -233,103 +298,225 @@ sub simple {
|
|||
print map { join($sep,@$_),"\n" } @table;
|
||||
}
|
||||
cut-here-UbsAqi0j6GoOuk5W5yWA
|
||||
) "$@"
|
||||
) -d "$sep" "$@"
|
||||
}
|
||||
export -f transpose_inner
|
||||
export -f transpose_perl
|
||||
|
||||
stdin_to_paste_files() {
|
||||
# Run transpose_inner on blocks from stdin
|
||||
# output each block as file name
|
||||
local block_size
|
||||
local sep
|
||||
block_size="$1"
|
||||
sep="$2"
|
||||
PARALLEL="-k --files --block $block_size" \
|
||||
parallel --pipe transpose_inner -d "'$sep'"
|
||||
transpose_csvtool() {
|
||||
# Use csvtool to transpose
|
||||
# Standard input:
|
||||
# data to be transposed
|
||||
# Standard output:
|
||||
# transposed data
|
||||
sep="$1"
|
||||
if [ "$sep" == "\s+" ] ; then
|
||||
# Multiple spaces = separator
|
||||
tr ' ' ' ' | tr -s ' ' |
|
||||
csvtool transpose -t " " -u " " -
|
||||
else
|
||||
csvtool transpose -t "$sep" -u "$sep" -
|
||||
fi
|
||||
}
|
||||
export -f transpose_csvtool
|
||||
|
||||
detect_transposer() {
|
||||
# Find the fastest transpose tool installed
|
||||
if bash -c 'echo 1 | transpose_csvtool ,' >/dev/null 2>&1; then
|
||||
echo transpose_csvtool
|
||||
else
|
||||
if bash -c 'echo 1 | transpose_perl ,' >/dev/null 2>&1; then
|
||||
echo transpose_perl
|
||||
else
|
||||
echo Error: You need the Perl Text::CSV module or csvtool. >&2
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
file_to_paste_files() {
|
||||
# Run transpose_inner on blocks from $file
|
||||
# output each block as file name
|
||||
local block_size
|
||||
local sep
|
||||
make_paste_files() {
|
||||
# Transpose input in blocks
|
||||
# Output:
|
||||
# each transposed block as file name
|
||||
block_size="$1"
|
||||
sep="$2"
|
||||
file="$3"
|
||||
PARALLEL="-k --files --block $block_size" \
|
||||
parallel --pipe-part -a "$file" transpose_inner -d "'$sep'"
|
||||
transposer=$(detect_transposer)
|
||||
par_opt="-k --files --block $block_size"
|
||||
if [ -z "$file" ]; then
|
||||
parallel $par_opt --pipe "$transposer" "'$sep'"
|
||||
else
|
||||
parallel $par_opt --pipe-part -a "$file" "$transposer" "'$sep'"
|
||||
fi
|
||||
}
|
||||
|
||||
super_paste() {
|
||||
# Like 'paste' up to 1000000 files
|
||||
# More than 250000 files requires extra filehandles for GNU Parallel
|
||||
# The files are read from stdin
|
||||
local sep
|
||||
local paste_files
|
||||
local fifo
|
||||
cleanup() {
|
||||
printf "\rSIGINT caught \n" >&2
|
||||
(rm -rf "$TMPDIR" &)&
|
||||
exit 1
|
||||
}
|
||||
|
||||
other_commands() {
|
||||
printf "\rSIGINT caught "
|
||||
ls -l $paste_files
|
||||
cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
|
||||
rm $paste_files
|
||||
}
|
||||
|
||||
trap 'other_commands' SIGINT
|
||||
trap 'cleanup' SIGINT
|
||||
|
||||
sep="$1"
|
||||
paste_files=`tempfile`
|
||||
paste_files=$(tempfile)
|
||||
# basename
|
||||
fifo=`tempfile`
|
||||
rm $fifo
|
||||
fifo=$(tempfile)
|
||||
rm "$fifo"
|
||||
# Group files from stdin in groups of 1000 files
|
||||
parallel -k -n1000 echo > $paste_files
|
||||
parallel -k -n1000 echo > "$paste_files"
|
||||
|
||||
# Define replacement string {0#} to 0-pad job number
|
||||
export PARALLEL="--rpl "\''{0#} $f=1+int("".(log(total_jobs())/log(10)));
|
||||
$_=sprintf("%0${f}d",seq())'\'
|
||||
|
||||
# Make fifos that can be read from
|
||||
cat $paste_files | parallel "rm -f $fifo{0#}; mkfifo $fifo{0#}"
|
||||
cat "$paste_files" | parallel "rm -f '$fifo'{0#}; mkfifo '$fifo'{0#}"
|
||||
|
||||
# Start a paste process for every 1000 files
|
||||
cat $paste_files | parallel -j0 "eval paste -d \''$sep'\' {} > $fifo{0#}" &
|
||||
cat "$paste_files" | parallel -j0 "eval paste -d \''$sep'\' {} > '$fifo'{0#}" &
|
||||
|
||||
# Paste all the fifos
|
||||
eval paste -d "'$sep'" $fifo*
|
||||
eval paste -d "'$sep'" "$fifo"*
|
||||
|
||||
# Cleanup
|
||||
cat $paste_files | parallel "eval rm -f {} $fifo{0#}"
|
||||
rm $paste_files
|
||||
cat "$paste_files" | parallel "eval rm -f {} '$fifo'{0#}"
|
||||
rm "$paste_files"
|
||||
}
|
||||
|
||||
stdin_detect_sep() {
|
||||
# Read the first 3 lines and detect the separator
|
||||
# Save the read input to file
|
||||
local file
|
||||
file="$1"
|
||||
# TODO
|
||||
echo "$d"
|
||||
# Read the first 3 lines of stdin and detect the separator
|
||||
# Only , space tab ; | : \0 and whitespace are detected
|
||||
# Save the 3 lines input to file so it can be read again later
|
||||
perl <(cat <<'cut-here-UbsAqi0j6GoOuk5W5yWA'
|
||||
#!/usr/bin/perl
|
||||
|
||||
sub max(@) {
|
||||
# Returns:
|
||||
# Maximum value of array
|
||||
my $max;
|
||||
for (@_) {
|
||||
# Skip undefs
|
||||
defined $_ or next;
|
||||
defined $max or do { $max = $_; next; }; # Set $max to the first non-undef
|
||||
$max = ($max > $_) ? $max : $_;
|
||||
}
|
||||
return $max;
|
||||
}
|
||||
|
||||
sub find_sep(@) {
|
||||
# Try the common separators.
|
||||
# Do we get the same for each line?
|
||||
my @csv = grep { not /^#/ } @_;
|
||||
my @sep = (",", "\t", ";", ' ', '\|', ':', "\0", '\s+');
|
||||
my $columns;
|
||||
my %col;
|
||||
for my $sep (@sep) {
|
||||
for my $line (@csv) {
|
||||
$columns = split /$sep/, $line;
|
||||
if($columns > 1) {
|
||||
$col{$sep."\0".$columns}++
|
||||
}
|
||||
}
|
||||
}
|
||||
# Find max $col{$sep,$columns}
|
||||
my $most_lines = max(values %col);
|
||||
|
||||
my %sepcol = (map { split /\0/, $_ }
|
||||
grep { $col{$_} == $most_lines } keys %col);
|
||||
my $most_cols = max(values %sepcol);
|
||||
return ((grep { $sepcol{$_} == $most_cols } keys %sepcol)[0]);
|
||||
}
|
||||
|
||||
my $buf = "";
|
||||
my $newlines = 0;
|
||||
open(OUT, "+>", shift) || die;
|
||||
# Copy (at least) 3 lines to OUT
|
||||
while(sysread(STDIN,$buf,131072)) {
|
||||
print OUT $buf;
|
||||
$newlines += $buf =~ tr/\n/\n/;
|
||||
if($newlines >= 3) {
|
||||
last;
|
||||
}
|
||||
}
|
||||
seek(OUT,0,0) || die;
|
||||
my @lines = <OUT>;
|
||||
close OUT;
|
||||
# Remove last half-line
|
||||
pop @lines;
|
||||
print find_sep(@lines);
|
||||
|
||||
cut-here-UbsAqi0j6GoOuk5W5yWA
|
||||
) "$@"
|
||||
}
|
||||
|
||||
matrix() {
|
||||
# Generate table X by Y
|
||||
row="$1"
|
||||
col="$2"
|
||||
sep="$3"
|
||||
|
||||
mxn() { perl -E 'for(1..'$1') { say join "'$3'", map {int(rand()*123456)} 1..'$2' } '; }
|
||||
export -f mxn
|
||||
thr=$(parallel --number-of-threads)
|
||||
prow=$((row/thr/100))
|
||||
seq $((thr*100)) | parallel mxn $prow $col $sep
|
||||
mxn $((row-prow*thr*100)) $col $sep
|
||||
}
|
||||
|
||||
demo() {
|
||||
# Generate table X by Y
|
||||
row="$1"
|
||||
col="$2"
|
||||
sep="$3"
|
||||
|
||||
# Generate string "- - - - " for each col
|
||||
paste_format_string=$(perl -e 'print map {"- "} 1..'$col)
|
||||
# Generate dummy values
|
||||
while seq 123456; do true; done |
|
||||
# Use paste's format string to make $col columns
|
||||
paste -d "$sep" $paste_format_string |
|
||||
# Keep the top $row rows
|
||||
head -n "$row"
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [-d delimiter] [-b blocksize]" 1>&2; exit 1;
|
||||
}
|
||||
|
||||
version() {
|
||||
cat <<EOF
|
||||
transpose 20201130
|
||||
Copyright (C) 2020 Ole Tange, http://ole.tange.dk
|
||||
License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>
|
||||
This is free software: you are free to change and redistribute it.
|
||||
GNU parallel comes with no warranty.
|
||||
|
||||
Web site: https://gitlab.com/ole.tange/tangetools/-/tree/master/transpose
|
||||
EOF
|
||||
}
|
||||
|
||||
main() {
|
||||
block_size=10M
|
||||
while getopts ":b:d:" o; do
|
||||
case "${o}" in
|
||||
block_size=100M
|
||||
while getopts ":b:d:V" o; do
|
||||
case "$o" in
|
||||
d)
|
||||
d="$(printf "${OPTARG}")"
|
||||
if [ "'" = "${d}" ] ; then
|
||||
# Convert \t to TAB using printf
|
||||
d=$(printf "$OPTARG")
|
||||
if [ "'" = "$d" ] ; then
|
||||
echo "Delimiter cannot be '"
|
||||
usage
|
||||
exit
|
||||
exit 0
|
||||
fi
|
||||
;;
|
||||
b)
|
||||
block_size="${OPTARG}"
|
||||
block_size="$OPTARG"
|
||||
;;
|
||||
V)
|
||||
version
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
usage
|
||||
|
@ -337,22 +524,46 @@ main() {
|
|||
esac
|
||||
done
|
||||
shift $((OPTIND-1))
|
||||
|
||||
if [ -z "${d}" ] ; then
|
||||
d="$(printf "\t")"
|
||||
fi
|
||||
|
||||
file="$@"
|
||||
first_lines=`tempfile`
|
||||
file="$1"
|
||||
sep="$d"
|
||||
# Put all tempfiles into a single dir
|
||||
export TMPDIR=`mktemp -d`
|
||||
first_lines=$(tempfile)
|
||||
if [ -z "$file" ]; then
|
||||
sep="$(stdin_detect_sep $first_lines)"
|
||||
(cat $first_lines; rm $first_lines; cat) |
|
||||
stdin_to_paste_files $block_size "$sep" | super_paste "$sep"
|
||||
else
|
||||
sep="$(stdin_detect_sep < "$file" $first_lines)"
|
||||
rm $first_lines
|
||||
file_to_paste_files $block_size "$sep" "$file" | super_paste "$sep"
|
||||
if [ -z "$sep" ] ; then
|
||||
sep=$(stdin_detect_sep $first_lines)
|
||||
if [ -z "$sep" ] ; then
|
||||
echo "transpose: Cannot autodetect separator. Use -d" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
if [ "$sep" == '\s+' ] ; then
|
||||
# Multiple spaces = separator
|
||||
osep=" "
|
||||
else
|
||||
osep="$sep"
|
||||
fi
|
||||
# Prepend stdin with the lines read in stdin_detect_sep
|
||||
(cat "$first_lines"; rm "$first_lines"; cat) |
|
||||
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
|
||||
else
|
||||
if [ -z "$sep" ] ; then
|
||||
sep=$(stdin_detect_sep < "$file" "$first_lines")
|
||||
if [ -z "$sep" ] ; then
|
||||
echo "transpose: Cannot autodetect separator. Use -d" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
rm "$first_lines"
|
||||
if [ "$sep" == '\s+' ] ; then
|
||||
# Multiple spaces = separator
|
||||
osep=" "
|
||||
else
|
||||
osep="$sep"
|
||||
fi
|
||||
make_paste_files "$block_size" "$sep" "$@" | super_paste "$osep"
|
||||
fi
|
||||
rmdir "$TMPDIR" 2>/dev/null
|
||||
}
|
||||
|
||||
# Make sure the whole file is read before starting
|
||||
|
|
|
@ -1,94 +0,0 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use Text::CSV;
|
||||
use File::Temp qw(tempfile tempdir);
|
||||
|
||||
my $csv;
|
||||
my (@table);
|
||||
my $first_line = 1;
|
||||
my $col = 0;
|
||||
while(my $l = <>) {
|
||||
if($first_line) {
|
||||
my $csv_setting = guess_csv_setting($l);
|
||||
$csv = Text::CSV->new($csv_setting)
|
||||
or die "Cannot use CSV: ".Text::CSV->error_diag ();
|
||||
$first_line = 0;
|
||||
}
|
||||
if(not $csv->parse($l)) {
|
||||
die "CSV has unexpected format";
|
||||
}
|
||||
# append to each row
|
||||
my $row = 0;
|
||||
|
||||
for($csv->fields()) {
|
||||
$table[$row][$col] = defined($_) ? $_ : '';
|
||||
$row++;
|
||||
}
|
||||
$col++;
|
||||
}
|
||||
|
||||
print map { join("\t",@$_),"\n" } @table;
|
||||
|
||||
sub guess_csv_setting {
|
||||
# Based on two lines guess the csv_setting
|
||||
my $line = shift;
|
||||
# Potential field separators
|
||||
# Priority:
|
||||
# \0 if both lines have the same number
|
||||
# \t if both lines have the same number
|
||||
my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/");
|
||||
my %count;
|
||||
@count{@fieldsep} = (0,0,0,0,0,0);
|
||||
# Count characters
|
||||
map { $count{$_}++ } split //,$line;
|
||||
my @sepsort = sort { $count{$b} <=> $count{$a} } @fieldsep;
|
||||
my $guessed_sep;
|
||||
if($count{"\0"} > 0) {
|
||||
# \0 is in the line => this is definitely the field sep
|
||||
$guessed_sep = "\0";
|
||||
} elsif($count{"\t"} > 0) {
|
||||
# \t is in the line => this is definitely the field sep
|
||||
$guessed_sep = "\t";
|
||||
} else {
|
||||
$guessed_sep = $sepsort[0];
|
||||
}
|
||||
return { binary => 1, sep_char => $guessed_sep };
|
||||
}
|
||||
|
||||
sub _guess_csv_setting {
|
||||
# Try different csv_settings
|
||||
# Return a $csv object with the best setting
|
||||
my @csv_file_types =
|
||||
( { binary => 1, sep_char => "\0" },
|
||||
{ binary => 1, sep_char => "\t" },
|
||||
{ binary => 1, sep_char => "," },
|
||||
{ binary => 1 },
|
||||
);
|
||||
|
||||
my $succesful_csv_type;
|
||||
my $csv;
|
||||
for my $csv_file_type (@csv_file_types) {
|
||||
$csv = Text::CSV->new ( $csv_file_type )
|
||||
or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag ();
|
||||
$succesful_csv_type = $csv_file_type;
|
||||
my $last_n_fields;
|
||||
for my $line (@lines) {
|
||||
if($csv->parse($line)) {
|
||||
my $n_fields = ($csv->fields());
|
||||
$last_fields ||= $n_fields;
|
||||
|
||||
} else{
|
||||
$succesful_csv_type = 0;
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
if(not $succesful_csv_type) {
|
||||
$csv->error_diag();
|
||||
}
|
||||
|
||||
$csv = Text::CSV->new ( $succesful_csv_type ) # should set binary attribute.
|
||||
or die "Cannot use CSV: ".Text::CSV->error_diag ();
|
||||
return($csv);
|
||||
}
|
Loading…
Reference in a new issue