parallel: --line-buffer memory usage changed from 2x to 1x.

This commit is contained in:
Ole Tange 2019-02-24 01:16:40 +01:00
parent fd4694c66b
commit 5e1e3775fc
11 changed files with 154 additions and 26 deletions

View file

@ -2,10 +2,6 @@ Quote of the month:
With GNU Parallel you sure can!
I like getting things done
--Kyle Lady @kylelady@twitter
@ -37,7 +33,12 @@ It's the MapReduce of our generation!
=== Used ===
Ok! GNU Parallel is one of the best things out there. Almost as good as vanilla ice cream.
With GNU Parallel you sure can!
I like getting things done
--Kyle Lady @kylelady@twitter
Ok! GNU Parallel is one of the best things out there. Almost as good as vanilla ice cream.
-- @coffe@mastodon.art
HOLY STUFF I LOVE GNU PARALLEL

View file

@ -23,7 +23,7 @@
use strict;
use Getopt::Long;
$Global::progname="niceload";
$Global::version = 20190222;
$Global::version = 20190223;
Getopt::Long::Configure("bundling","require_order");
get_options_from_array(\@ARGV) || die_usage();
if($opt::version) {

View file

@ -200,13 +200,10 @@ sub pipe_tee_setup() {
sub parcat_script() {
# TODO if script fails: Use parallel -j0 --plain --lb cat ::: fifos
my $script = q'{
use Symbol qw(gensym);
use IPC::Open3;
use POSIX qw(:errno_h);
use IO::Select;
use strict;
use threads;
use threads::shared;
use Thread::Queue;
use Fcntl qw(:DEFAULT :flock);
@ -369,7 +366,7 @@ sub parcat_script() {
fcntl($fh, &F_SETFL, $flags) || die $!; # Set the flags on the filehandle
}
}';
return ::spacefree(1, $script);
return ::spacefree(3, $script);
}
sub sharder_script() {
@ -432,7 +429,7 @@ sub pipe_shard_setup() {
open STDOUT, ">","/dev/null";
# The PERL_HASH_SEED must be the same for all sharders
# so B::hash will return the same value for any given input
$ENV{PERL_HASH_SEED} = $$;
$ENV{'PERL_HASH_SEED'} = $$;
exec qw(parallel --block 100k -q --pipe -j), $njobs,
qw(--roundrobin -u perl -e), $script, ($opt::colsep || ","),
$opt::shard, '{}', (map { (':::+', @{$_}) } @parcatfifos);
@ -1698,7 +1695,7 @@ sub check_invalid_option_combinations() {
sub init_globals() {
# Defaults:
$Global::version = 20190222;
$Global::version = 20190223;
$Global::progname = 'parallel';
$Global::infinity = 2**31;
$Global::debug = 0;
@ -5054,6 +5051,10 @@ sub spacefree($$) {
# Keep newlines
$s =~ s/\n\n+/\n/sg;
$s =~ s/[ \t]+/ /mg;
} elsif(3 == $spaces) {
# Keep perl code required space
$s =~ s{([^a-zA-Z0-9/])\s+}{$1}sg;
$s =~ s{([a-zA-Z0-9/])\s+([^:a-zA-Z0-9/])}{$1$2}sg;
} else {
$s =~ s/\s//mg;
}
@ -8516,7 +8517,7 @@ sub sshlogin_wrap($) {
} else {
$bashfuncset = '$bashfunc = "";'
}
if($ENV{"parallel_bash_environment"}) {
if($ENV{'parallel_bash_environment'}) {
$bashfuncset .= '$bashfunc .= "eval\ \"\$parallel_bash_environment\"\;";';
}
::debug("base64",$envset,$bashfuncset,"\n");
@ -9527,7 +9528,11 @@ sub print_linebuffer($) {
# read remaining
my $halfline_ref = $self->{'halfline'}{$fdno};
if(grep /./, @$halfline_ref) {
$self->add_returnsize(length join("",@$halfline_ref));
my $returnsize = 0;
for(@{$self->{'halfline'}{$fdno}}) {
$returnsize += length $_;
}
$self->add_returnsize($returnsize);
if($opt::tag or defined $opt::tagstring) {
# Prepend $tag to the remaining half line
unshift @$halfline_ref, $self->tag();

View file

@ -2148,7 +2148,7 @@ E.g.
B<--shebang-wrap> must be set as the first option.
=item B<--shellquote> (alpha testing)
=item B<--shellquote> (beta testing)
Does not run the command but quotes it. Useful for making quoted
composed commands for GNU B<parallel>.

View file

@ -1959,6 +1959,46 @@ https://github.com/codingo/Interlace can be run with GNU B<parallel>:
https://github.com/codingo/Interlace (Last checked: 2019-02)
=head2 DIFFERENCES BETWEEN otonvm Parallel AND GNU Parallel
I have been unable to get the code to run at all. It seems unfinished.
https://github.com/otonvm/Parallel (Last checked: 2019-02)
=head2 DIFFERENCES BETWEEN k-bx par AND GNU Parallel
B<par> requires Haskell to work. This limits the number of platforms
this can work on.
B<par> does line buffering in memory. The memory usage is 3x the
longest line (compared to 1x for B<parallel --lb>). Commands must be
given as arguments. There is no template.
These are the examples from https://github.com/k-bx/par with the
corresponding GNU B<parallel> command.
par "echo foo; sleep 1; echo foo; sleep 1; echo foo" \
"echo bar; sleep 1; echo bar; sleep 1; echo bar" && echo "success"
parallel --lb ::: "echo foo; sleep 1; echo foo; sleep 1; echo foo" \
"echo bar; sleep 1; echo bar; sleep 1; echo bar" && echo "success"
par "echo foo; sleep 1; foofoo" \
"echo bar; sleep 1; echo bar; sleep 1; echo bar" && echo "success"
parallel --lb --halt 1 ::: "echo foo; sleep 1; foofoo" \
"echo bar; sleep 1; echo bar; sleep 1; echo bar" && echo "success"
par "PARPREFIX=[fooechoer] echo foo" "PARPREFIX=[bar] echo bar"
parallel --lb --colsep , --tagstring {1} {2} \
::: "[fooechoer],echo foo" "[bar],echo bar"
par --succeed "foo" "bar" && echo 'wow'
parallel ::: "foo" "bar"; true && echo 'wow'
https://github.com/k-bx/par (Last checked: 2019-02)
=head2 Todo
Url for spread
@ -1979,13 +2019,6 @@ https://github.com/xuchenCN/go-pssh
https://github.com/amritb/with-this.git
https://github.com/fd0/machma Requires Go >= 1.7.
https://github.com/k-bx/par requires Haskell to work. This limits the
number of platforms this can work on.
https://github.com/otonvm/Parallel
https://github.com/flesler/parallel
https://github.com/Julian/Verge

View file

@ -20,6 +20,14 @@ a single file: No need to mess around with environment variables like
PERL5LIB.
=head2 Interpreted language
GNU B<parallel> is designed to be able to run on old systems. That
means that it cannot depend on a compiler being installed - and
especially not a compiler for a language that is younger than 20 years
old.
=head2 Old Perl style
GNU B<parallel> uses some old, deprecated constructs. This is due to a
@ -526,6 +534,63 @@ The real killer comes when you try to combine several of these: Doing
that correctly for all corner cases is next to impossible to do by
hand.
=head2 --shard
The simple way to implement sharding would be to:
=over 5
=item 1
start n jobs,
=item 2
split each line into columns,
=item 3
select the data from the relevant column
=item 4
compute a hash value from the data
=item 5
take the modulo n of the hash value
=item 6
pass the full line to the jobslot that has the computed value
=back
Unfortunately Perl is rather slow at computing the hash value (and
somewhat slow at splitting into columns).
One solution is to use a compiled language for the splitting and
hashing, but that would go against the design criteria of not
depending on a compiler.
Luckily those tasks can be parallelized. So GNU B<parallel> starts n
sharders that do steps 2-6, and passes blocks of 100k to each of those
in a round robin manner. To make sure these sharders compute the hash
the same way, $PERL_HASH_SEED is set to the same value for all sharders.
Running n sharders poses a new problem: Instead of having n outputs
(one for each computed value) you now have n outputs for each of the n
values, so in total n*n outputs; and you need to merge these n*n
outputs together into n outputs.
This can be done by simply running 'parallel -j0 --lb cat :::
outputs_for_one_value', but that is rather inefficient, as it spawns a
process for each file. Instead the core code from 'parcat' is run,
which is also a bit faster.
All the sharders and parcats communicate through named pipes that are
unlinked as soon as they are opened.
=head2 Shell shock

View file

@ -574,7 +574,7 @@ $Global::Initfile && unlink $Global::Initfile;
exit ($err);
sub parse_options {
$Global::version = 20190222;
$Global::version = 20190223;
$Global::progname = 'sql';
# This must be done first as this may exec myself

View file

@ -275,6 +275,27 @@ par_test_diff_roundrobin_k() {
fi
}
par_lb_mem_usage() {
    # Measure the peak memory of 'parallel --lb' when the jobs emit
    # very long lines, and print it in multiples of roughly 100 MB.

    # Print one 100 MB line of "x" with no trailing newline.
    long_line() {
        perl -e 'print "x"x100_000_000'
    }
    # Export the function so the commands given to parallel below can call it.
    export -f long_line

    # memusage divisor command [args...]
    #   Run the command under '/usr/bin/time -v' with stdout discarded,
    #   pick the "Maximum resident set size" (kbytes) from the report on
    #   stderr and print that number integer-divided by divisor.
    memusage() {
        round=$1
        shift
        /usr/bin/time -v "$@" 2>&1 >/dev/null |
            perl -ne '/Maximum resident set size .kbytes.: (\d+)/ and print $1,"\n"' |
            perl -pe '$_ = int($_/'$round')."\n"'
    }

    # The cases, in order:
    #   1 line               - RAM usage 1 x 100 MB
    #   2 lines              - RAM usage 1 x 100 MB
    #   1 double length line - RAM usage 2 x 100 MB
    local job
    for job in \
        'long_line' \
        'long_line; echo; long_line' \
        'long_line; long_line'
    do
        memusage 100000 parallel --lb ::: "$job"
    done
}
export -f $(compgen -A function | grep par_)
compgen -A function | grep par_ | LC_ALL=C sort |
parallel -j6 --tag -k --joblog /tmp/jl-`basename $0` '{} 2>&1'

View file

@ -27,8 +27,8 @@ par_interactive sleep 0.1; echo opt-p 2 ?...n
par_interactive sleep 0.1; echo opt-p 3 ?...y
par_interactive spawn /tmp/parallel-script-for-expect
par_k ### Test -k
par_k parallel: Warning: Only enough file handles to run 8 jobs in parallel.
par_k parallel: Warning: Running 'parallel -j0 -N 8 --pipe parallel -j0' or
par_k parallel: Warning: Only enough file handles to run 9 jobs in parallel.
par_k parallel: Warning: Running 'parallel -j0 -N 9 --pipe parallel -j0' or
par_k parallel: Warning: raising 'ulimit -n' or 'nofile' in /etc/security/limits.conf
par_k parallel: Warning: or /proc/sys/fs/file-max may help.
par_k begin

View file

@ -62,6 +62,9 @@ par_kill_term_twice parallel: bash -c 'sleep 120 & pid=$!; wait $pid' 1
par_kill_term_twice bash-+-perl---bash---sleep
par_kill_term_twice `-pstree
par_kill_term_twice bash---pstree
par_lb_mem_usage 1
par_lb_mem_usage 1
par_lb_mem_usage 2
par_multiline_commands bug #50781: joblog format with multiline commands
par_multiline_commands 1
par_multiline_commands finish 1

View file

@ -61,7 +61,7 @@ echo '### Check that 4 processes are really used'
echo '### --version must have higher priority than retired options'
### --version must have higher priority than retired options
$NICEPAR --version -g -Y -U -W -T | tail
GNU parallel 20190123
GNU parallel 20190223
Copyright (C) 2007-2019 Ole Tange and Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.