mirror of
https://git.savannah.gnu.org/git/parallel.git
synced 2024-11-22 05:57:54 +00:00
--pipepart --group-by implemented (except column names).
This commit is contained in:
parent
324a9f3a07
commit
aa62104eb5
|
@ -206,7 +206,7 @@ from:tange@gnu.org
|
||||||
to:parallel@gnu.org, bug-parallel@gnu.org
|
to:parallel@gnu.org, bug-parallel@gnu.org
|
||||||
stable-bcc: Jesse Alama <jessealama@fastmail.fm>
|
stable-bcc: Jesse Alama <jessealama@fastmail.fm>
|
||||||
|
|
||||||
Subject: GNU Parallel 20190522 ('') released <<[stable]>>
|
Subject: GNU Parallel 20190522 ('Akihito') released <<[stable]>>
|
||||||
|
|
||||||
GNU Parallel 20190522 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
|
GNU Parallel 20190522 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
|
||||||
|
|
||||||
|
@ -223,6 +223,12 @@ Quote of the month:
|
||||||
|
|
||||||
New in this release:
|
New in this release:
|
||||||
|
|
||||||
|
* --group-by groups lines depending on value of a column. The value can be computed.
|
||||||
|
|
||||||
|
* How to compress (bzip / gzip) a very large text quickly? https://medium.com/@gchandra/how-to-compress-bzip-gzip-a-very-large-text-quickly-27c11f4c6681
|
||||||
|
|
||||||
|
* Simple tutorial to install & use GNU Parallel https://medium.com/@gchandra/simple-tutorial-to-install-use-gnu-parallel-79251120d618
|
||||||
|
|
||||||
* Introducing Parallel into Shell https://petelawson.com/post/parallel-in-shell/
|
* Introducing Parallel into Shell https://petelawson.com/post/parallel-in-shell/
|
||||||
|
|
||||||
* Bug fixes and man page updates.
|
* Bug fixes and man page updates.
|
||||||
|
|
138
src/parallel
138
src/parallel
|
@ -462,7 +462,7 @@ sub pipe_part_files(@) {
|
||||||
}
|
}
|
||||||
my $header = find_header(\$buf,open_or_exit($file));
|
my $header = find_header(\$buf,open_or_exit($file));
|
||||||
# find positions
|
# find positions
|
||||||
my @pos = find_split_positions($file,$Global::blocksize,length $header);
|
my @pos = find_split_positions($file,$Global::blocksize,$header);
|
||||||
# Make @cat_prepends
|
# Make @cat_prepends
|
||||||
my @cat_prepends = ();
|
my @cat_prepends = ();
|
||||||
for(my $i=0; $i<$#pos; $i++) {
|
for(my $i=0; $i<$#pos; $i++) {
|
||||||
|
@ -507,19 +507,23 @@ sub find_split_positions($$$) {
|
||||||
# Input:
|
# Input:
|
||||||
# $file = the file to read
|
# $file = the file to read
|
||||||
# $block = (minimal) --block-size of each chunk
|
# $block = (minimal) --block-size of each chunk
|
||||||
# $headerlen = length of header to be skipped
|
# $header = header to be skipped
|
||||||
# Uses:
|
# Uses:
|
||||||
# $opt::recstart
|
# $opt::recstart
|
||||||
# $opt::recend
|
# $opt::recend
|
||||||
# Returns:
|
# Returns:
|
||||||
# @positions of block start/end
|
# @positions of block start/end
|
||||||
my($file, $block, $headerlen) = @_;
|
my($file, $block, $header) = @_;
|
||||||
|
my $headerlen = length $header;
|
||||||
my $size = -s $file;
|
my $size = -s $file;
|
||||||
if(-b $file) {
|
if(-b $file) {
|
||||||
# $file is a blockdevice
|
# $file is a blockdevice
|
||||||
$size = size_of_block_dev($file);
|
$size = size_of_block_dev($file);
|
||||||
}
|
}
|
||||||
$block = int $block;
|
$block = int $block;
|
||||||
|
if($opt::groupby) {
|
||||||
|
return split_positions_for_group_by($file,$size,$block,$header);
|
||||||
|
}
|
||||||
# The optimal dd blocksize for mint, redhat, solaris, openbsd = 2^17..2^20
|
# The optimal dd blocksize for mint, redhat, solaris, openbsd = 2^17..2^20
|
||||||
# The optimal dd blocksize for freebsd = 2^15..2^17
|
# The optimal dd blocksize for freebsd = 2^15..2^17
|
||||||
my $dd_block_size = 131072; # 2^17
|
my $dd_block_size = 131072; # 2^17
|
||||||
|
@ -563,12 +567,126 @@ sub find_split_positions($$$) {
|
||||||
}
|
}
|
||||||
if($pos[$#pos] != $size) {
|
if($pos[$#pos] != $size) {
|
||||||
# Last splitpoint was not at end of the file: add it
|
# Last splitpoint was not at end of the file: add it
|
||||||
push(@pos,$size);
|
push @pos, $size;
|
||||||
}
|
}
|
||||||
close $fh;
|
close $fh;
|
||||||
return @pos;
|
return @pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sub split_positions_for_group_by($$$$) {
    # Find chunk borders in $file for --pipepart --group-by, so that
    # lines with the same group value are never split across two chunks.
    # Only a few lines are read: lines roughly $block bytes apart are
    # probed, and the exact border between two groups is then located
    # by binary search.
    # Input:
    #   $file = the file to read
    #   $size = size of $file in bytes
    #   $block = (minimal) --block-size of each chunk
    #   $header = header to be skipped
    # Uses:
    #   $opt::recsep
    #   $opt::colsep (set to "\t" if unset and a column is used)
    #   $Global::group_by_column
    #   $Global::group_by_perlexpr
    # Returns:
    #   @positions of block start/end
    my ($file,$size,$block,$header) = @_;
    my @pos;
    my $fh = open_or_exit($file);
    # Set $Global::group_by_column $Global::group_by_perlexpr
    group_by_loop($opt::recsep);

    # The helpers are anonymous closures on purpose: a nested *named*
    # sub only sees the $fh of the first call of the enclosing sub
    # ("Variable will not stay shared"), which breaks when several
    # files are processed.

    my $value_at = sub {
        # Input:
        #   $pos = byte position in the file
        # Returns:
        #   ($value,$linepos) = group value and start position of the
        #     first full line starting at or after $pos.
        #     $value is undef at end of file.
        my $pos = shift;
        if($pos != 0) {
            # Start one byte early: if $pos is a line start, only the
            # previous line's terminator is skipped.
            seek($fh, $pos-1, 0) || die;
            # Read half line
            my $partial_line = <$fh>;
        } else {
            # Do not depend on where an earlier probe left the handle
            seek($fh, 0, 0) || die;
        }
        # Read full line
        my $linepos = tell($fh);
        # Keep the caller's $_ intact; the perl expression works on $_
        local $_ = <$fh>;
        if(defined $_) {
            # Not end of file
            # @F stays in scope for the evaluated perl expression
            my @F;
            if(defined $Global::group_by_column) {
                $opt::colsep ||= "\t";
                @F = split /$opt::colsep/, $_;
                $_ = $F[$Global::group_by_column];
            }
            # User supplied perl expression from --group-by: it is
            # meant to be executed and to modify $_
            eval $Global::group_by_perlexpr;
        }
        my $value = $_;
        return ($value,$linepos);
    };

    my $binary_search_end = sub {
        # Find where the group with value $s ends.
        # Input:
        #   $s = group value
        #   $spos = start of a line with value $s
        #   $epos = start of a later line with another value (or
        #     end of file)
        # Returns:
        #   ($value,$linepos) of the first line after the group
        my ($s,$spos,$epos) = @_;
        # value_at($spos) == $s
        # value_at($epos) != $s
        my $posdif = $epos - $spos;
        while($posdif) {
            my ($v,$vpos) = $value_at->($spos+$posdif);
            if($v eq $s) {
                $spos = $vpos;
                $posdif = $epos - $spos;
            } else {
                $epos = $vpos;
            }
            $posdif = int($posdif/2);
        }
        # The line at $epos is now the line right after the last line
        # with value $s. The last probed line is not always that line
        # (e.g. a one byte line just before the border), so look up
        # the border explicitly.
        return $value_at->($epos);
    };

    my $binary_search_start = sub {
        # Find where the group with value $s starts.
        # Input:
        #   $s = group value
        #   $spos = start of a line with another value
        #   $epos = start of a later line with value $s
        # Returns:
        #   ($s,$linepos) where $linepos is the first line of the group
        my ($s,$spos,$epos) = @_;
        # value_at($spos) != $s
        # value_at($epos) == $s
        my $posdif = $epos - $spos;
        while($posdif) {
            my ($v,$vpos) = $value_at->($spos+$posdif);
            if($v eq $s) {
                $epos = $vpos;
            } else {
                $spos = $vpos;
                $posdif = $epos - $spos;
            }
            $posdif = int($posdif/2);
        }
        # The line at $epos is the first line with value $s, even if
        # the last probe ended on the line just before it.
        return ($s,$epos);
    };

    # $xpos = linestart, $xval = value at $xpos, $apos < $bpos < $cpos
    my ($aval,$bval,$cval,$apos,$bpos,$cpos);
    ($aval,$apos) = $value_at->(length $header);
    while($apos < $size) {
        push @pos, $apos;
        ($bval,$bpos) = $value_at->($apos + $block);
        if(eof($fh)) {
            # Less than a block left: the rest is the last chunk
            push @pos, $size;
            last;
        }
        ($cval,$cpos) = $value_at->($bpos + $block);
        if($aval eq $bval) {
            while($bval eq $cval) {
                # Move bpos, cpos a block forward until
                # $aval == $bval != $cval
                $bpos = $cpos;
                ($cval,$cpos) = $value_at->($cpos + $block);
                if($cpos >= $size) {
                    $cpos = $size;
                    last;
                }
            }
            # $aval == $bval != $cval
            # Binary search for $bval ending between ($bpos,$cpos)
            ($bval,$bpos) = $binary_search_end->($bval,$bpos,$cpos);
        } elsif($bval eq $cval) {
            # $aval != $bval == $cval
            # Binary search for $bval starting between ($apos,$bpos)
            ($bval,$bpos) = $binary_search_start->($bval,$apos,$bpos);
        } else {
            # $aval != $bval != $cval
            # Binary search for $bval ending between ($bpos,$cpos)
            ($bval,$bpos) = $binary_search_end->($bval,$bpos,$cpos);
        }
        # The border found is the start of the next chunk
        ($aval,$apos) = ($bval,$bpos);
    }
    # An empty @pos counts as 0, so an empty file yields no positions
    my $last_pos = @pos ? $pos[$#pos] : 0;
    if($last_pos != $size) {
        # Last splitpoint was not at end of the file: add it
        push @pos, $size;
    }
    close $fh;
    return @pos;
}
|
||||||
|
|
||||||
sub cat_partial($@) {
|
sub cat_partial($@) {
|
||||||
# Efficient command to copy from byte X to byte Y
|
# Efficient command to copy from byte X to byte Y
|
||||||
# Input:
|
# Input:
|
||||||
|
@ -637,6 +755,8 @@ sub group_by_loop($) {
|
||||||
}
|
}
|
||||||
# What is left of $groupby is $perlexpr
|
# What is left of $groupby is $perlexpr
|
||||||
$perlexpr = $groupby;
|
$perlexpr = $groupby;
|
||||||
|
$Global::group_by_perlexpr = $perlexpr;
|
||||||
|
$Global::group_by_column = $col;
|
||||||
|
|
||||||
my $loop = ::spacefree(0,'{
|
my $loop = ::spacefree(0,'{
|
||||||
local $_=COLVALUE;
|
local $_=COLVALUE;
|
||||||
|
@ -1792,7 +1912,7 @@ sub check_invalid_option_combinations() {
|
||||||
::wait_and_exit(255);
|
::wait_and_exit(255);
|
||||||
}
|
}
|
||||||
if($opt::groupby) {
|
if($opt::groupby) {
|
||||||
if(not $opt::pipe) {
|
if(not $opt::pipe and not $opt::pipepart) {
|
||||||
$opt::pipe = 1;
|
$opt::pipe = 1;
|
||||||
}
|
}
|
||||||
if($opt::remove_rec_sep) {
|
if($opt::remove_rec_sep) {
|
||||||
|
@ -1807,12 +1927,6 @@ sub check_invalid_option_combinations() {
|
||||||
::error("--recend is not compatible with --groupby");
|
::error("--recend is not compatible with --groupby");
|
||||||
::wait_and_exit(255);
|
::wait_and_exit(255);
|
||||||
}
|
}
|
||||||
if($opt::pipepart) {
|
|
||||||
# TODO This may be possible to do later
|
|
||||||
# Finding split points might be a bitch though
|
|
||||||
::error("--pipepart is not compatible with --groupby");
|
|
||||||
::wait_and_exit(255);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12558,7 +12672,7 @@ sub main() {
|
||||||
pipe_shard_setup();
|
pipe_shard_setup();
|
||||||
}
|
}
|
||||||
|
|
||||||
if($opt::groupby) {
|
if(not $opt::pipepart and $opt::groupby) {
|
||||||
group_by_stdin_filter();
|
group_by_stdin_filter();
|
||||||
}
|
}
|
||||||
if($opt::eta or $opt::bar or $opt::shuf or $Global::halt_pct) {
|
if($opt::eta or $opt::bar or $opt::shuf or $Global::halt_pct) {
|
||||||
|
|
|
@ -797,8 +797,8 @@ See also: B<--line-buffer> B<--ungroup>
|
||||||
|
|
||||||
=item B<--group-by> I<val> (alpha testing)
|
=item B<--group-by> I<val> (alpha testing)
|
||||||
|
|
||||||
Group input by value. Combined with B<--pipe> B<--group-by> groups
|
Group input by value. Combined with B<--pipe>/B<--pipepart>
|
||||||
lines with the same value into a record.
|
B<--group-by> groups lines with the same value into a record.
|
||||||
|
|
||||||
The value can be computed from the full line or from a single column.
|
The value can be computed from the full line or from a single column.
|
||||||
|
|
||||||
|
@ -815,6 +815,8 @@ Use the value in the column numbered.
|
||||||
Treat the first line as a header and use the value in the column
|
Treat the first line as a header and use the value in the column
|
||||||
named.
|
named.
|
||||||
|
|
||||||
|
(Not supported with B<--pipepart>).
|
||||||
|
|
||||||
=item Z<> perl expression
|
=item Z<> perl expression
|
||||||
|
|
||||||
Run the perl expression and use $_ as the value.
|
Run the perl expression and use $_ as the value.
|
||||||
|
@ -827,17 +829,19 @@ Put the value of the column put in $_, run the perl expression, and use $_ as th
|
||||||
|
|
||||||
Put the value of the column in $_, run the perl expression, and use $_ as the value.
|
Put the value of the column in $_, run the perl expression, and use $_ as the value.
|
||||||
|
|
||||||
|
(Not supported with B<--pipepart>).
|
||||||
|
|
||||||
=back
|
=back
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
UserID, Consumption
|
UserID, Consumption
|
||||||
123, 1
|
123, 1
|
||||||
123, 2
|
123, 2
|
||||||
12-3, 1
|
12-3, 1
|
||||||
221, 3
|
221, 3
|
||||||
221, 1
|
221, 1
|
||||||
2/21, 5
|
2/21, 5
|
||||||
|
|
||||||
If you want to group 123, 12-3, 221, and 2/21 into 4 records and pass
|
If you want to group 123, 12-3, 221, and 2/21 into 4 records and pass
|
||||||
one record at a time to B<wc>:
|
one record at a time to B<wc>:
|
||||||
|
@ -861,7 +865,7 @@ UserID when grouping:
|
||||||
cat table.csv | parallel --pipe --colsep , --header : \
|
cat table.csv | parallel --pipe --colsep , --header : \
|
||||||
--group-by 'UserID s/\D//g' -kN1 wc
|
--group-by 'UserID s/\D//g' -kN1 wc
|
||||||
|
|
||||||
See also B<--shard>.
|
See also B<--shard>, B<--roundrobin>.
|
||||||
|
|
||||||
|
|
||||||
=item B<--help>
|
=item B<--help>
|
||||||
|
@ -1975,6 +1979,8 @@ impossible to track which input block corresponds to which output.
|
||||||
|
|
||||||
B<--roundrobin> implies B<--pipe>, except if B<--pipepart> is given.
|
B<--roundrobin> implies B<--pipe>, except if B<--pipepart> is given.
|
||||||
|
|
||||||
|
See also B<--group-by>, B<--shard>.
|
||||||
|
|
||||||
|
|
||||||
=item B<--rpl> 'I<tag> I<perl expression>'
|
=item B<--rpl> 'I<tag> I<perl expression>'
|
||||||
|
|
||||||
|
@ -2164,6 +2170,8 @@ I<shardkey> is small (<10), slower if it is big (>100).
|
||||||
|
|
||||||
B<--shard> requires B<--pipe> and a fixed numeric value for B<--jobs>.
|
B<--shard> requires B<--pipe> and a fixed numeric value for B<--jobs>.
|
||||||
|
|
||||||
|
See also B<--group-by>, B<--roundrobin>.
|
||||||
|
|
||||||
|
|
||||||
=item B<--shebang>
|
=item B<--shebang>
|
||||||
|
|
||||||
|
@ -4462,6 +4470,46 @@ files:
|
||||||
$ tracefile -un ./configure | tail | parallel -j0 apt-file search
|
$ tracefile -un ./configure | tail | parallel -j0 apt-file search
|
||||||
|
|
||||||
|
|
||||||
|
=head1 SPREADING BLOCKS OF DATA
|
||||||
|
|
||||||
|
B<--round-robin>, B<--pipe-part>, B<--shard>, and B<--group-by> are
|
||||||
|
all specialized versions of B<--pipe>.
|
||||||
|
|
||||||
|
In the following I<n> is the number of jobslots given by B<--jobs>. A
|
||||||
|
record starts with B<--recstart> and ends with B<--recend>. It is
|
||||||
|
typically a full line. A chunk is a number of full records that is
|
||||||
|
approximately the size of a block. A block can contain half records, a
|
||||||
|
chunk cannot.
|
||||||
|
|
||||||
|
B<--pipe> starts one job per chunk. It reads blocks from stdin
|
||||||
|
(standard input). It finds a record end near a block border and passes
|
||||||
|
a chunk to the program.
|
||||||
|
|
||||||
|
B<--pipe-part> starts one job per chunk - just like normal
|
||||||
|
B<--pipe>. It first finds record endings near all block borders in the
|
||||||
|
file and then starts the jobs. By using B<--block -1> it will set the
|
||||||
|
block size to 1/I<n> * size-of-file. Used this way it will start I<n>
|
||||||
|
jobs in total.
|
||||||
|
|
||||||
|
B<--round-robin> starts I<n> jobs in total. It reads a block and
|
||||||
|
passes a chunk to whichever job is ready to read. It does not parse
|
||||||
|
the content except for identifying where a record ends to make sure it
|
||||||
|
only passes full records.
|
||||||
|
|
||||||
|
B<--shard> starts I<n> jobs in total. It parses each line to read the
|
||||||
|
value in the given column. Based on this value the line is passed to
|
||||||
|
one of the I<n> jobs. All lines having this value will be given to the
|
||||||
|
same jobslot.
|
||||||
|
|
||||||
|
B<--group-by> starts one job per chunk. Record borders are not given
|
||||||
|
by B<--recend>/B<--recstart>. Instead a record is defined by a number
|
||||||
|
of lines having the same value in a given column. So the value of a
|
||||||
|
given column changes at a chunk border. With B<--pipe> every line is
|
||||||
|
parsed, with B<--pipe-part> only a few lines are parsed to find the
|
||||||
|
chunk border.
|
||||||
|
|
||||||
|
B<--group-by> can be combined with B<--round-robin> or B<--pipe-part>.
|
||||||
|
|
||||||
=head1 QUOTING
|
=head1 QUOTING
|
||||||
|
|
||||||
GNU B<parallel> is very liberal in quoting. You only need to quote
|
GNU B<parallel> is very liberal in quoting. You only need to quote
|
||||||
|
|
|
@ -189,5 +189,13 @@ par_test_build_and_install() {
|
||||||
sudo parallel mv {} {.}
|
sudo parallel mv {} {.}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#par_crashing() {
|
||||||
|
# echo '### bug #56322: sem crashed when running with input from seq'
|
||||||
|
# echo "### This should not fail"
|
||||||
|
# doit() { seq 100000000 |xargs -P 80 -n 1 sem true; }
|
||||||
|
# export -f doit
|
||||||
|
# parallel -j1 --timeout 100 --nice 11 doit ::: 1
|
||||||
|
#}
|
||||||
|
|
||||||
export -f $(compgen -A function | grep par_)
|
export -f $(compgen -A function | grep par_)
|
||||||
compgen -A function | grep par_ | sort | parallel -vj0 -k --tag --joblog /tmp/jl-`basename $0` '{} 2>&1'
|
compgen -A function | grep par_ | sort | parallel -vj0 -k --tag --joblog /tmp/jl-`basename $0` '{} 2>&1'
|
||||||
|
|
Loading…
Reference in a new issue