parallel: --line-buffer memory usage changed from 2x to 1x.

This commit is contained in:
Ole Tange 2019-02-24 01:16:40 +01:00
parent fd4694c66b
commit 5e1e3775fc
11 changed files with 154 additions and 26 deletions

View file

@ -2,10 +2,6 @@ Quote of the month:
With GNU Parallel you sure can!
I like getting things done
--Kyle Lady @kylelady@twitter
@ -37,6 +33,11 @@ It's the MapReduce of our generation!
=== Used === === Used ===
With GNU Parallel you sure can!
I like getting things done
--Kyle Lady @kylelady@twitter
Ok! GNU Parallel is one of the best things out there. Almost as good as vanilla ice cream. Ok! GNU Parallel is one of the best things out there. Almost as good as vanilla ice cream.
-- @coffe@mastodon.art -- @coffe@mastodon.art

View file

@ -23,7 +23,7 @@
use strict; use strict;
use Getopt::Long; use Getopt::Long;
$Global::progname="niceload"; $Global::progname="niceload";
$Global::version = 20190222; $Global::version = 20190223;
Getopt::Long::Configure("bundling","require_order"); Getopt::Long::Configure("bundling","require_order");
get_options_from_array(\@ARGV) || die_usage(); get_options_from_array(\@ARGV) || die_usage();
if($opt::version) { if($opt::version) {

View file

@ -200,13 +200,10 @@ sub pipe_tee_setup() {
sub parcat_script() { sub parcat_script() {
# TODO if script fails: Use parallel -j0 --plain --lb cat ::: fifos # TODO if script fails: Use parallel -j0 --plain --lb cat ::: fifos
my $script = q'{ my $script = q'{
use Symbol qw(gensym);
use IPC::Open3;
use POSIX qw(:errno_h); use POSIX qw(:errno_h);
use IO::Select; use IO::Select;
use strict; use strict;
use threads; use threads;
use threads::shared;
use Thread::Queue; use Thread::Queue;
use Fcntl qw(:DEFAULT :flock); use Fcntl qw(:DEFAULT :flock);
@ -369,7 +366,7 @@ sub parcat_script() {
fcntl($fh, &F_SETFL, $flags) || die $!; # Set the flags on the filehandle fcntl($fh, &F_SETFL, $flags) || die $!; # Set the flags on the filehandle
} }
}'; }';
return ::spacefree(1, $script); return ::spacefree(3, $script);
} }
sub sharder_script() { sub sharder_script() {
@ -432,7 +429,7 @@ sub pipe_shard_setup() {
open STDOUT, ">","/dev/null"; open STDOUT, ">","/dev/null";
# The PERL_HASH_SEED must be the same for all sharders # The PERL_HASH_SEED must be the same for all sharders
# so B::hash will return the same value for any given input # so B::hash will return the same value for any given input
$ENV{PERL_HASH_SEED} = $$; $ENV{'PERL_HASH_SEED'} = $$;
exec qw(parallel --block 100k -q --pipe -j), $njobs, exec qw(parallel --block 100k -q --pipe -j), $njobs,
qw(--roundrobin -u perl -e), $script, ($opt::colsep || ","), qw(--roundrobin -u perl -e), $script, ($opt::colsep || ","),
$opt::shard, '{}', (map { (':::+', @{$_}) } @parcatfifos); $opt::shard, '{}', (map { (':::+', @{$_}) } @parcatfifos);
@ -1698,7 +1695,7 @@ sub check_invalid_option_combinations() {
sub init_globals() { sub init_globals() {
# Defaults: # Defaults:
$Global::version = 20190222; $Global::version = 20190223;
$Global::progname = 'parallel'; $Global::progname = 'parallel';
$Global::infinity = 2**31; $Global::infinity = 2**31;
$Global::debug = 0; $Global::debug = 0;
@ -5054,6 +5051,10 @@ sub spacefree($$) {
# Keep newlines # Keep newlines
$s =~ s/\n\n+/\n/sg; $s =~ s/\n\n+/\n/sg;
$s =~ s/[ \t]+/ /mg; $s =~ s/[ \t]+/ /mg;
} elsif(3 == $spaces) {
# Keep only the whitespace that the perl code requires
$s =~ s{([^a-zA-Z0-9/])\s+}{$1}sg;
$s =~ s{([a-zA-Z0-9/])\s+([^:a-zA-Z0-9/])}{$1$2}sg;
} else { } else {
$s =~ s/\s//mg; $s =~ s/\s//mg;
} }
@ -8516,7 +8517,7 @@ sub sshlogin_wrap($) {
} else { } else {
$bashfuncset = '$bashfunc = "";' $bashfuncset = '$bashfunc = "";'
} }
if($ENV{"parallel_bash_environment"}) { if($ENV{'parallel_bash_environment'}) {
$bashfuncset .= '$bashfunc .= "eval\ \"\$parallel_bash_environment\"\;";'; $bashfuncset .= '$bashfunc .= "eval\ \"\$parallel_bash_environment\"\;";';
} }
::debug("base64",$envset,$bashfuncset,"\n"); ::debug("base64",$envset,$bashfuncset,"\n");
@ -9527,7 +9528,11 @@ sub print_linebuffer($) {
# read remaining # read remaining
my $halfline_ref = $self->{'halfline'}{$fdno}; my $halfline_ref = $self->{'halfline'}{$fdno};
if(grep /./, @$halfline_ref) { if(grep /./, @$halfline_ref) {
$self->add_returnsize(length join("",@$halfline_ref)); my $returnsize = 0;
for(@{$self->{'halfline'}{$fdno}}) {
$returnsize += length $_;
}
$self->add_returnsize($returnsize);
if($opt::tag or defined $opt::tagstring) { if($opt::tag or defined $opt::tagstring) {
# Prepend $tag to the remaining half line # Prepend $tag to the remaining half line
unshift @$halfline_ref, $self->tag(); unshift @$halfline_ref, $self->tag();

View file

@ -2148,7 +2148,7 @@ E.g.
B<--shebang-wrap> must be set as the first option. B<--shebang-wrap> must be set as the first option.
=item B<--shellquote> (alpha testing) =item B<--shellquote> (beta testing)
Does not run the command but quotes it. Useful for making quoted Does not run the command but quotes it. Useful for making quoted
composed commands for GNU B<parallel>. composed commands for GNU B<parallel>.

View file

@ -1959,6 +1959,46 @@ https://github.com/codingo/Interlace can be run with GNU B<parallel>:
https://github.com/codingo/Interlace (Last checked: 2019-02) https://github.com/codingo/Interlace (Last checked: 2019-02)
=head2 DIFFERENCES BETWEEN otonvm Parallel AND GNU Parallel
I have been unable to get the code to run at all. It seems unfinished.
https://github.com/otonvm/Parallel (Last checked: 2019-02)
=head2 DIFFERENCES BETWEEN k-bx par AND GNU Parallel
B<par> requires Haskell to work. This limits the number of platforms
this can work on.
B<par> does line buffering in memory. The memory usage is 3x the
longest line (compared to 1x for B<parallel --lb>). Commands must be
given as arguments. There is no template.
These are the examples from https://github.com/k-bx/par with the
corresponding GNU B<parallel> command.
par "echo foo; sleep 1; echo foo; sleep 1; echo foo" \
"echo bar; sleep 1; echo bar; sleep 1; echo bar" && echo "success"
parallel --lb ::: "echo foo; sleep 1; echo foo; sleep 1; echo foo" \
"echo bar; sleep 1; echo bar; sleep 1; echo bar" && echo "success"
par "echo foo; sleep 1; foofoo" \
"echo bar; sleep 1; echo bar; sleep 1; echo bar" && echo "success"
parallel --lb --halt 1 ::: "echo foo; sleep 1; foofoo" \
"echo bar; sleep 1; echo bar; sleep 1; echo bar" && echo "success"
par "PARPREFIX=[fooechoer] echo foo" "PARPREFIX=[bar] echo bar"
parallel --lb --colsep , --tagstring {1} {2} \
::: "[fooechoer],echo foo" "[bar],echo bar"
par --succeed "foo" "bar" && echo 'wow'
parallel ::: "foo" "bar"; true && echo 'wow'
https://github.com/k-bx/par (Last checked: 2019-02)
=head2 Todo =head2 Todo
Url for spread Url for spread
@ -1979,13 +2019,6 @@ https://github.com/xuchenCN/go-pssh
https://github.com/amritb/with-this.git https://github.com/amritb/with-this.git
https://github.com/fd0/machma Requires Go >= 1.7.
https://github.com/k-bx/par requires Haskell to work. This limits the
number of platforms this can work on.
https://github.com/otonvm/Parallel
https://github.com/flesler/parallel https://github.com/flesler/parallel
https://github.com/Julian/Verge https://github.com/Julian/Verge

View file

@ -20,6 +20,14 @@ a single file: No need to mess around with environment variables like
PERL5LIB. PERL5LIB.
=head2 Interpreted language
GNU B<parallel> is designed to be able to run on old systems. That
means that it cannot depend on a compiler being installed - and
especially not a compiler for a language that is younger than 20 years
old.
=head2 Old Perl style =head2 Old Perl style
GNU B<parallel> uses some old, deprecated constructs. This is due to a GNU B<parallel> uses some old, deprecated constructs. This is due to a
@ -526,6 +534,63 @@ The real killer comes when you try to combine several of these: Doing
that correctly for all corner cases is next to impossible to do by that correctly for all corner cases is next to impossible to do by
hand. hand.
=head2 --shard
The simple way to implement sharding would be to:
=over 5
=item 1
start n jobs,
=item 2
split each line into columns,
=item 3
select the data from the relevant column
=item 4
compute a hash value from the data
=item 5
take the modulo n of the hash value
=item 6
pass the full line to the jobslot that has the computed value
=back
Unfortunately Perl is rather slow at computing the hash value (and
somewhat slow at splitting into columns).
One solution is to use a compiled language for the splitting and
hashing, but that would go against the design criteria of not
depending on a compiler.
Luckily those tasks can be parallelized. So GNU B<parallel> starts n
sharders that do steps 2-6, and passes blocks of 100k to each of those
in a round robin manner. To make sure these sharders compute the hash
the same way, $PERL_HASH_SEED is set to the same value for all sharders.
Running n sharders poses a new problem: Instead of having n outputs
(one for each computed value) you now have n outputs for each of the n
values, so in total n*n outputs; and you need to merge these n*n
outputs together into n outputs.
This can be done by simply running 'parallel -j0 --lb cat :::
outputs_for_one_value', but that is rather inefficient, as it spawns a
process for each file. Instead the core code from 'parcat' is run,
which is also a bit faster.
All the sharders and parcats communicate through named pipes that are
unlinked as soon as they are opened.
=head2 Shell shock =head2 Shell shock

View file

@ -574,7 +574,7 @@ $Global::Initfile && unlink $Global::Initfile;
exit ($err); exit ($err);
sub parse_options { sub parse_options {
$Global::version = 20190222; $Global::version = 20190223;
$Global::progname = 'sql'; $Global::progname = 'sql';
# This must be done first as this may exec myself # This must be done first as this may exec myself

View file

@ -275,6 +275,27 @@ par_test_diff_roundrobin_k() {
fi fi
} }
par_lb_mem_usage() {
    # Measure the peak memory used by 'parallel --lb' for very long lines.
    # Each case prints the maximum resident set size reported by
    # /usr/bin/time -v, divided by 100000 kbytes and truncated to an
    # integer, i.e. roughly "how many times 100 MB".

    # Print 100 million 'x' characters with no trailing newline.
    long_line() {
        perl -e 'print "x"x100_000_000'
    }
    # Exported so the command run by parallel can find the function.
    export -f long_line

    # Usage: memusage divisor command [args...]
    # Runs the command under /usr/bin/time -v and prints
    # int(max RSS in kbytes / divisor) followed by a newline.
    memusage() {
        # Keep the divisor local so it does not leak into the caller.
        local round=$1
        shift
        # '2>&1 >/dev/null' sends stderr (where the time report goes) into
        # the pipe and discards the command's stdout.
        # The divisor is handed to perl as an argument instead of being
        # spliced into the perl source text.
        /usr/bin/time -v "$@" 2>&1 >/dev/null |
            perl -ne 'BEGIN { $round = shift @ARGV }
                      /Maximum resident set size .kbytes.: (\d+)/
                          and print int($1/$round), "\n"' "$round"
    }

    # 1 line - RAM usage 1 x 100 MB
    memusage 100000 parallel --lb ::: long_line
    # 2 lines - RAM usage 1 x 100 MB
    memusage 100000 parallel --lb ::: 'long_line; echo; long_line'
    # 1 double length line - RAM usage 2 x 100 MB
    memusage 100000 parallel --lb ::: 'long_line; long_line'
}
export -f $(compgen -A function | grep par_) export -f $(compgen -A function | grep par_)
compgen -A function | grep par_ | LC_ALL=C sort | compgen -A function | grep par_ | LC_ALL=C sort |
parallel -j6 --tag -k --joblog /tmp/jl-`basename $0` '{} 2>&1' parallel -j6 --tag -k --joblog /tmp/jl-`basename $0` '{} 2>&1'

View file

@ -27,8 +27,8 @@ par_interactive sleep 0.1; echo opt-p 2 ?...n
par_interactive sleep 0.1; echo opt-p 3 ?...y par_interactive sleep 0.1; echo opt-p 3 ?...y
par_interactive spawn /tmp/parallel-script-for-expect par_interactive spawn /tmp/parallel-script-for-expect
par_k ### Test -k par_k ### Test -k
par_k parallel: Warning: Only enough file handles to run 8 jobs in parallel. par_k parallel: Warning: Only enough file handles to run 9 jobs in parallel.
par_k parallel: Warning: Running 'parallel -j0 -N 8 --pipe parallel -j0' or par_k parallel: Warning: Running 'parallel -j0 -N 9 --pipe parallel -j0' or
par_k parallel: Warning: raising 'ulimit -n' or 'nofile' in /etc/security/limits.conf par_k parallel: Warning: raising 'ulimit -n' or 'nofile' in /etc/security/limits.conf
par_k parallel: Warning: or /proc/sys/fs/file-max may help. par_k parallel: Warning: or /proc/sys/fs/file-max may help.
par_k begin par_k begin

View file

@ -62,6 +62,9 @@ par_kill_term_twice parallel: bash -c 'sleep 120 & pid=$!; wait $pid' 1
par_kill_term_twice bash-+-perl---bash---sleep par_kill_term_twice bash-+-perl---bash---sleep
par_kill_term_twice `-pstree par_kill_term_twice `-pstree
par_kill_term_twice bash---pstree par_kill_term_twice bash---pstree
par_lb_mem_usage 1
par_lb_mem_usage 1
par_lb_mem_usage 2
par_multiline_commands bug #50781: joblog format with multiline commands par_multiline_commands bug #50781: joblog format with multiline commands
par_multiline_commands 1 par_multiline_commands 1
par_multiline_commands finish 1 par_multiline_commands finish 1

View file

@ -61,7 +61,7 @@ echo '### Check that 4 processes are really used'
echo '### --version must have higher priority than retired options' echo '### --version must have higher priority than retired options'
### --version must have higher priority than retired options ### --version must have higher priority than retired options
$NICEPAR --version -g -Y -U -W -T | tail $NICEPAR --version -g -Y -U -W -T | tail
GNU parallel 20190123 GNU parallel 20190223
Copyright (C) 2007-2019 Ole Tange and Free Software Foundation, Inc. Copyright (C) 2007-2019 Ole Tange and Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html> License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it. This is free software: you are free to change and redistribute it.