parallel: --csv initial version.

This commit is contained in:
Ole Tange 2018-04-21 21:50:42 +02:00
parent 8b050b68d4
commit bbcef1032c
4 changed files with 85 additions and 15 deletions

View file

@ -1047,6 +1047,7 @@ sub options_hash {
"max-args|maxargs|n=i" => \$opt::max_args,
"max-replace-args|N=i" => \$opt::max_replace_args,
"colsep|col-sep|C=s" => \$opt::colsep,
"csv"=> \$opt::csv,
"help|h" => \$opt::help,
"L=f" => \$opt::L,
"max-lines|l:f" => \$opt::max_lines,
@ -1175,6 +1176,14 @@ sub parse_options {
}
if(defined $opt::tmuxpane) { $opt::tmux = $opt::tmuxpane; }
if(defined $opt::colsep) { $Global::trim = 'lr'; }
if(defined $opt::csv) {
    # --csv: parse input records with Text::CSV so that quoted fields
    # (embedded separators, doubled quotes, newlines) are handled.
    #
    # Text::CSV is not a core module, so it is loaded lazily and the
    # result is cached in %Global::use. If it cannot be loaded, stop
    # here with a clear message instead of failing later with
    # "Can't locate object method "new" via package "Text::CSV"".
    $Global::use{"Text::CSV"} ||= eval "use Text::CSV; 1;";
    if(not $Global::use{"Text::CSV"}) {
        die "--csv requires the perl module Text::CSV, which could not be loaded: $@";
    }
    # The field separator defaults to ',' unless --colsep was given
    $opt::colsep = defined $opt::colsep ? $opt::colsep : ",";
    # binary => 1 allows newlines and other binary characters in quoted fields
    my $csv_setting = { binary => 1, sep_char => $opt::colsep };
    $Global::csv = Text::CSV->new($csv_setting)
        or die "Cannot use CSV: ".Text::CSV->error_diag ();
}
if(defined $opt::header) {
$opt::colsep = defined $opt::colsep ? $opt::colsep : "\t";
}
@ -1242,11 +1251,11 @@ sub parse_options {
# Is the output a dir or CSV-file?
if($opt::results =~ /\.csv$/i) {
# CSV with , as separator
$Global::csv = ",";
$Global::csvsep = ",";
$Global::membuffer ||= 1;
} elsif($opt::results =~ /\.tsv$/i) {
# CSV with TAB as separator
$Global::csv = "\t";
$Global::csvsep = "\t";
$Global::membuffer ||= 1;
}
}
@ -1254,7 +1263,7 @@ sub parse_options {
my ($compress, $decompress) = find_compression_program();
$opt::compress_program ||= $compress;
$opt::decompress_program ||= $decompress;
if(($opt::results and not $Global::csv) or $opt::files) {
if(($opt::results and not $Global::csvsep) or $opt::files) {
# No need for decompressing
$opt::decompress_program = "cat >/dev/null";
}
@ -4098,7 +4107,7 @@ sub usage {
"If you use programs that use GNU Parallel to process data for an article in a",
"scientific publication, please cite:",
"",
" O. Tange (2018): GNU Parallel 2018, Apr 2018, ISBN 9781387509881,",
" O. Tange (2018): GNU Parallel 2018, Mar 2018, ISBN 9781387509881,",
" DOI https://doi.org/10.5281/zenodo.1146014",
"",
"This helps funding further development; AND IT WON'T COST YOU A CENT.",
@ -4126,7 +4135,7 @@ sub citation_notice {
"If you use programs that use GNU Parallel to process data for an article in a",
"scientific publication, please cite:",
"",
" O. Tange (2018): GNU Parallel 2018, Apr 2018, ISBN 9781387509881,",
" O. Tange (2018): GNU Parallel 2018, Mar 2018, ISBN 9781387509881,",
" DOI https://doi.org/10.5281/zenodo.1146014",
"",
"This helps funding further development; AND IT WON'T COST YOU A CENT.",
@ -4220,7 +4229,7 @@ sub citation {
" author = {Tange, Ole},",
" title = {GNU Parallel 2018},",
" publisher = {Ole Tange},",
" month = Apr,",
" month = Mar,",
" year = 2018,",
" ISBN = {9781387509881},",
" doi = {10.5281/zenodo.1146014},",
@ -7146,7 +7155,7 @@ sub openoutputfiles {
}
# Return immediately because we do not need setting filenames
return;
} elsif($opt::results and not $Global::csv) {
} elsif($opt::results and not $Global::csvsep) {
my $out = $self->{'commandline'}->results_out();
my $seqname;
if($out eq $opt::results or $out =~ m:/$:) {
@ -8841,7 +8850,7 @@ sub print {
if($opt::sqlworker and not $opt::results) {
$Global::sql->output($self);
}
if($Global::csv) {
if($Global::csvsep) {
# Add output to CSV when finished
$self->print_csv();
}
@ -8905,7 +8914,7 @@ sub print {
sub combine_ref {
# Inspired by Text::CSV_PP::_combine (by Makamaka Hannyaharamitu)
my @part = @_;
my $sep = $Global::csv;
my $sep = $Global::csvsep;
my $quot = '"';
my @out = ();
@ -9008,7 +9017,7 @@ sub print_linebuffer {
}
}
if(not $self->virgin()) {
if($opt::files or ($opt::results and not $Global::csv)) {
if($opt::files or ($opt::results and not $Global::csvsep)) {
# Print filename
if($fdno == 1 and not $self->fh($fdno,"printed")) {
print $out_fd $self->tag(),$self->fh($fdno,"name"),"\n";
@ -9055,7 +9064,7 @@ sub print_linebuffer {
$self->add_returnsize($outputlength);
}
if(defined $self->{'exitstatus'}) {
if($opt::files or ($opt::results and not $Global::csv)) {
if($opt::files or ($opt::results and not $Global::csvsep)) {
$self->add_returnsize(-s $self->fh($fdno,"name"));
} else {
# If the job is dead: print the remaining partial line
@ -10459,7 +10468,7 @@ sub new {
# Open SQL table
$arg_sub_queue = SQLRecordQueue->new();
} elsif(defined $colsep) {
# Open one file with colsep
# Open one file with colsep or CSV
$arg_sub_queue = RecordColQueue->new($fhs);
} else {
# Open one or more files if multiple -a
@ -10561,8 +10570,19 @@ sub get {
my $line = $arg->orig();
::debug("run", "line='$line'\n");
if($line ne "") {
for my $s (split /$opt::colsep/o, $line, -1) {
push @out_record, Arg->new($s);
if($opt::csv) {
# Parse CSV
chomp $line;
if(not $Global::csv->parse($line)) {
die "CSV has unexpected format: ^$line^";
}
for($Global::csv->fields()) {
push @out_record, Arg->new($_);
}
} else {
for my $s (split /$opt::colsep/o, $line, -1) {
push @out_record, Arg->new($s);
}
}
} else {
push @out_record, Arg->new("");
@ -10797,15 +10817,30 @@ sub read_arg_from_fh {
my $fh = shift;
my $prepend;
my $arg;
my $double_quotes = 0;
do {{
# This makes 10% faster
if(not ($arg = <$fh>)) {
if(not defined ($arg = <$fh>)) {
if(defined $prepend) {
return Arg->new($prepend);
} else {
return undef;
}
}
if($opt::csv) {
# A CSV record may span several physical lines when a quoted field
# contains a newline, so keep reading until the record is complete.
# y/"/"/ replaces each '"' with itself, i.e. it only counts them;
# the count is accumulated over all physical lines read so far.
$double_quotes += ($arg =~ y/"/"/);
if($double_quotes % 2) {
# Odd number of double quotes: we are still inside a quoted field.
# Example of such a half line:
# col1,"col2 2""x3"" board newline <-this one
# cont",col3
# Stash the partial line and restart the inner block to read the
# next physical line.
# NOTE(review): $prepend is presumably joined with the final $arg
# further down - that code is not visible here; confirm.
$prepend .= $arg;
redo;
} else {
# Even number of double quotes: the CSV record is complete.
# Reset the counter for the next record.
$double_quotes = 0;
}
}
# Remove delimiter
chomp $arg;
if($Global::end_of_file_string and

View file

@ -592,6 +592,21 @@ Use I<prg> for (de)compressing temporary files. It is assumed that I<prg
output) unless B<--decompress-program> is given.
=item B<--csv> (alpha testing)
Treat input as CSV-format (requires the Perl module Text::CSV).
B<--colsep> sets the field delimiter (default is ','). It works very
much like B<--colsep> except it deals correctly with quoting:
echo '"1 big, 2 small","2""x4"" plank",12.34' |
parallel --csv echo {1} of {2} at {3}
Even quoted newlines are parsed correctly:
(echo '"Start of field 1 with newline'
echo 'Line 2 in field 1";value 2') |
parallel --csv --colsep ';' echo Field 1: {1} Field 2: {2}
=item B<--delimiter> I<delim>
=item B<-d> I<delim>

View file

@ -847,6 +847,20 @@ par_dryrun_append_joblog() {
wc -l < /tmp/jl.$$
}
par_0_no_newline() {
    # Input consisting of a lone "0" with no trailing newline must
    # still be run as one job.
    printf '%s\n' 'A single zero without \n should not be ignored'
    printf '0' | parallel echo
}
par_csv() {
    # One CSV record spread over three physical lines: quoted fields
    # with doubled quotes and embedded newlines.
    printf '%s\n' \
        '"col1""x3""","new' \
        'line col2","new2' \
        'line col3",col 4' |
        parallel --csv echo {1}-{2}-{3}-{4}
    # One single-line record: a quoted field containing the separator.
    printf '%s\n' '"2""x3"" board","Value with ,",Column 3' |
        parallel --csv echo {1}-{2}-{3}
}
# Export every shell function whose name contains "par_" so the shells
# started by parallel can call them.
export -f $(compgen -A function | grep par_)
# Run all par_ functions as jobs, sorted by name: up to 6 at a time (-j6),
# each output line prefixed with the function name (--tag), output kept
# in input order (-k), stderr merged into stdout, and the job log
# appended (+) to /tmp/jl-<name of this script>.
compgen -A function | grep par_ | sort |
parallel -j6 --tag -k --joblog +/tmp/jl-`basename $0` '{} 2>&1'

View file

@ -1329,6 +1329,8 @@ echo '### Test --tty'
/dev/tty
### 1 .par file from --files expected
0
par_0_no_newline A single zero without \n should not be ignored
par_0_no_newline 0
par_X_eta_div_zero ### bug #34422: parallel -X --eta crashes with div by zero
par_X_eta_div_zero
par_X_eta_div_zero Computers / CPU cores / Max jobs to run
@ -1360,6 +1362,10 @@ par_blocking_redir stdout
par_colsep_0 bug --colsep 0
par_colsep_0 OK
par_colsep_0 OK
par_csv col1"x3"-new
par_csv line col2-new2
par_csv line col3-col 4
par_csv 2"x3" board-Value with ,-Column 3
par_dryrun_append_joblog --dry-run should not append to joblog
par_dryrun_append_joblog 1
par_dryrun_append_joblog 2