Faster version bug #45479: --pipe/--pipepart --tee.

This commit is contained in:
Ole Tange 2017-02-05 23:53:28 +01:00
parent 70006774e8
commit a2358aebe4
8 changed files with 247 additions and 180 deletions

View file

@ -51,8 +51,17 @@ my @command = @ARGV;
my @input_source_fh;
if($opt::pipepart) {
if($opt::tee) {
@input_source_fh = map { open_or_exit($_) } @opt::a;
# Remove the first: It will be the file piped.
shift @input_source_fh;
if(not @input_source_fh and not $opt::pipe) {
@input_source_fh = (*STDIN);
}
} else {
# -a is used for data - not for command line args
@input_source_fh = map { open_or_exit($_) } "/dev/null";
}
} else {
@input_source_fh = map { open_or_exit($_) } @opt::a;
if(not @input_source_fh and not $opt::pipe) {
@ -77,6 +86,65 @@ if($opt::skip_first_line) {
set_input_source_header();
if($opt::filter_hosts and (@opt::sshlogin or @opt::sshloginfile)) {
# Parallel check all hosts are up. Remove hosts that are down
filter_hosts();
}
if($opt::nonall or $opt::onall) {
onall(\@input_source_fh,@command);
wait_and_exit(min(undef_as_zero($Global::exitstatus),254));
}
# TODO --transfer foo/./bar --cleanup
# multiple --transfer and --basefile with different /./
$Global::JobQueue = JobQueue->new(
\@command,\@input_source_fh,$Global::ContextReplace,
$number_of_args,\@Global::transfer_files,\@Global::ret_files);
if($opt::pipepart) {
pipepart_setup();
} elsif($opt::pipe and $opt::tee) {
pipe_tee_setup();
}
if($opt::eta or $opt::bar or $opt::shuf or $Global::halt_pct) {
# Count the number of jobs or shuffle all jobs
# before starting any.
# Must be done after ungetting any --pipepart jobs.
$Global::JobQueue->total_jobs();
}
# Compute $Global::max_jobs_running
# Must be done after ungetting any --pipepart jobs.
max_jobs_running();
init_run_jobs();
my $sem;
if($Global::semaphore) {
$sem = acquire_semaphore();
}
$SIG{TERM} = \&start_no_new_jobs;
start_more_jobs();
if($opt::tee) {
# All jobs must be running in parallel for --tee
$Global::start_no_new_jobs = 1;
} elsif($opt::pipe and not $opt::pipepart) {
spreadstdin();
}
::debug("init", "Start draining\n");
drain_job_queue();
::debug("init", "Done draining\n");
reaper();
::debug("init", "Done reaping\n");
::debug("init", "Cleaning\n");
if($Global::semaphore) {
$sem->release();
}
cleanup();
::debug("init", "Halt\n");
halt();
sub set_input_source_header {
if($opt::header and not $opt::pipe) {
# split with colsep or \t
@ -117,24 +185,44 @@ sub set_input_source_header {
}
}
if($opt::filter_hosts and (@opt::sshlogin or @opt::sshloginfile)) {
# Parallel check all hosts are up. Remove hosts that are down
filter_hosts();
sub max_jobs_running {
if(not $Global::max_jobs_running) {
# Compute $Global::max_jobs_running
for my $sshlogin (values %Global::host) {
$sshlogin->max_jobs_running();
}
}
return $Global::max_jobs_running;
}
if($opt::nonall or $opt::onall) {
onall(\@input_source_fh,@command);
wait_and_exit(min(undef_as_zero($Global::exitstatus),254));
sub halt {
if($opt::halt and $Global::halt_when ne "never") {
if(not defined $Global::halt_exitstatus) {
if($Global::halt_pct) {
$Global::halt_exitstatus =
::ceil($Global::total_failed / $Global::total_started * 100);
} elsif($Global::halt_count) {
$Global::halt_exitstatus =
::min(undef_as_zero($Global::total_failed),101);
}
}
wait_and_exit($Global::halt_exitstatus);
} else {
wait_and_exit(min(undef_as_zero($Global::exitstatus),101));
}
}
# TODO --transfer foo/./bar --cleanup
# multiple --transfer and --basefile with different /./
sub __PIPE_MODE__ {}
$Global::JobQueue = JobQueue->new(
\@command,\@input_source_fh,$Global::ContextReplace,
$number_of_args,\@Global::transfer_files,\@Global::ret_files);
if($opt::pipepart) {
sub pipepart_setup {
if($opt::tee) {
# Prepend each command with
# cat file
my $cat_string = "cat ".::shell_quote_scalar($opt::a[0]);
for(1..max_jobs_running()) {
push @Global::cat_prepends, $cat_string;
}
} else {
if(not $opt::blocksize) {
# --blocksize with 10 jobs per jobslot
$opt::blocksize = -10;
@ -156,75 +244,43 @@ if($opt::pipepart) {
wait_and_exit(255);
}
}
# Compute $Global::max_jobs_running
$Global::dummy_jobs = 1;
for my $sshlogin (values %Global::host) {
$sshlogin->max_jobs_running();
}
$Global::max_jobs_running or
::die_bug("Global::max_jobs_running not set");
# Run in total $job_slots*(- $blocksize) jobs
# Set --blocksize = size / no of proc / (- $blocksize)
$Global::dummy_jobs = 1;
$Global::blocksize = 1 +
int($size / $Global::max_jobs_running / -$opt::blocksize);
int($size / max_jobs_running() / -$opt::blocksize);
}
@Global::cat_partials = map { pipe_part_files($_) } @opt::a;
@Global::cat_prepends = map { pipe_part_files($_) } @opt::a;
# Unget the empty arg as many times as there are parts
$Global::JobQueue->{'commandlinequeue'}{'arg_queue'}->unget(
map { [Arg->new("\0")] } @Global::cat_partials
map { [Arg->new("\0")] } @Global::cat_prepends
);
}
if($opt::eta or $opt::bar or $opt::shuf or $Global::halt_pct) {
# Count the number of jobs or shuffle all jobs
# before starting any.
# Must be done after ungetting any --pipepart jobs.
$Global::JobQueue->total_jobs();
}
# Compute $Global::max_jobs_running
# Must be done after ungetting any --pipepart jobs.
for my $sshlogin (values %Global::host) {
$sshlogin->max_jobs_running();
}
init_run_jobs();
my $sem;
if($Global::semaphore) {
$sem = acquire_semaphore();
}
$SIG{TERM} = \&start_no_new_jobs;
start_more_jobs();
if($opt::pipe and not $opt::pipepart) {
spreadstdin();
}
::debug("init", "Start draining\n");
drain_job_queue();
::debug("init", "Done draining\n");
reaper();
::debug("init", "Done reaping\n");
::debug("init", "Cleaning\n");
if($Global::semaphore) {
$sem->release();
}
cleanup();
::debug("init", "Halt\n");
if($opt::halt and $Global::halt_when ne "never") {
if(not defined $Global::halt_exitstatus) {
if($Global::halt_pct) {
$Global::halt_exitstatus =
::ceil($Global::total_failed / $Global::total_started * 100);
} elsif($Global::halt_count) {
$Global::halt_exitstatus =
::min(undef_as_zero($Global::total_failed),101);
}
}
wait_and_exit($Global::halt_exitstatus);
} else {
wait_and_exit(min(undef_as_zero($Global::exitstatus),101));
sub pipe_tee_setup {
# mkfifo t1..5
my @fifos;
for(1..max_jobs_running()) {
push @fifos, tmpfifo();
}
# cat foo | tee t1 t2 t3 t4 t5 > /dev/null
if(not fork()){
# Let tee inheirit our stdin
# and redirect stdout to null
open STDOUT, ">","/dev/null";
exec "tee",@fifos;
}
sub __PIPE_MODE__ {}
# cat t1 | grep 1
# cat t2 | grep 2
# cat t3 | grep 3
# cat t4 | grep 4
# cat t5 | grep 5
# Remove the tmpfifo as soon as it is open
@Global::cat_prepends = map { "(rm $_;cat) < $_" } @fifos;
}
sub pipe_part_files {
# Input:
@ -240,12 +296,12 @@ sub pipe_part_files {
my $header = find_header(\$buf,open_or_exit($file));
# find positions
my @pos = find_split_positions($file,$Global::blocksize,length $header);
# Make @cat_partials
my @cat_partials = ();
# Make @cat_prepends
my @cat_prepends = ();
for(my $i=0; $i<$#pos; $i++) {
push @cat_partials, cat_partial($file, 0, length($header), $pos[$i], $pos[$i+1]);
push @cat_prepends, cat_partial($file, 0, length($header), $pos[$i], $pos[$i+1]);
}
return @cat_partials;
return @cat_prepends;
}
sub find_header {
@ -336,17 +392,21 @@ sub cat_partial {
# $file = the file to read
# ($start, $end, [$start2, $end2, ...]) = start byte, end byte
# Returns:
# Efficient perl command to copy $start..$end, $start2..$end2, ... to stdout
# Efficient command to copy $start..$end, $start2..$end2, ... to stdout
my($file, @start_end) = @_;
my($start, $i);
# Convert start_end to start_len
my @start_len = map { if(++$i % 2) { $start = $_; } else { $_-$start } } @start_end;
my $script = spacefree(0,
my @start_len = map {
if(++$i % 2) { $start = $_; } else { $_-$start }
} @start_end;
my $script = spacefree
(0,
q{
while(@ARGV) {
sysseek(STDIN,shift,0) || die;
$left = shift;
while($read = sysread(STDIN,$buf, $left > 131072 ? 131072 : $left)){
while($read =
sysread(STDIN,$buf, $left > 131072 ? 131072 : $left)){
$left -= $read;
syswrite(STDOUT,$buf);
}
@ -639,50 +699,6 @@ sub nindex {
}
}
{
my $sleep = 1;
sub tee_write {
# Write the block to all jobs
#
# Input:
# $header_ref = ref to $header string
# $block_ref = ref to $block to be written
# $recstart = record start string
# $recend = record end string
# $endpos = end position of $block
# Uses:
# %Global::running
# Returns:
# $written = amount of bytes written
my ($header_ref,$buffer_ref,$recstart,$recend,$endpos) = @_;
my $written = 0;
my $done = 0;
my %done;
while(not $done) {
$done = 1;
for my $job (values %Global::running) {
if(not $done{$job}) {
$done = 0;
if($job->block_length() > 0) {
# Flush old block
$written += $job->non_blocking_write();
} else {
# Give a copy of the new block
$job->set_block($header_ref,$buffer_ref,$endpos,$recstart,$recend);
$job->set_virgin(0);
$written += $job->non_blocking_write();
# Mark this job as done
$done{$job} = 1;
}
}
}
$sleep = ::reap_usleep($sleep);
}
return $written;
}
}
sub index64 {
# Do index on strings > 2GB.
# index in Perl < v5.22 does not work for > 2GB
@ -795,10 +811,6 @@ sub write_record_to_pipe {
# Write the block to one of the already running jobs
return round_robin_write($header_ref,$buffer_ref,$recstart,$recend,$endpos);
}
if($opt::tee) {
# Write the block to all jobs
return tee_write($header_ref,$buffer_ref,$recstart,$recend,$endpos);
}
# If no virgin found, backoff
my $sleep = 0.0001; # 0.01 ms - better performance on highend
while(not @Global::virgin_jobs) {
@ -1349,7 +1361,7 @@ sub check_invalid_option_combinations {
sub init_globals {
# Defaults:
$Global::version = 20170123;
$Global::version = 20170202;
$Global::progname = 'parallel';
$Global::infinity = 2**31;
$Global::debug = 0;
@ -4119,17 +4131,17 @@ sub tmpname {
do {
$tmpname = $ENV{'TMPDIR'}."/".$name.
join"", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5);
} while($Global::unlink{$tmpname}++ or -e $tmpname);
} while(-e $tmpname or $Global::unlink{$tmpname}++);
return $tmpname;
}
#sub tmpfifo {
# # Find an unused name and mkfifo on it
# use POSIX qw(mkfifo);
# my $tmpfifo = tmpname("fif",@_);
# mkfifo($tmpfifo,0600);
# return $tmpfifo;
#}
sub tmpfifo {
# Find an unused name and mkfifo on it
use POSIX qw(mkfifo);
my $tmpfifo = tmpname("fif",@_);
mkfifo($tmpfifo,0600);
return $tmpfifo;
}
sub rm {
# Remove file and remove it from %Global::unlink
@ -7129,7 +7141,8 @@ sub wrapped {
# * --cat
# * --fifo
# * --sshlogin
# * --pipepart (@Global::cat_partials)
# * --pipepart (@Global::cat_prepends)
# * --tee (@Global::cat_prepends)
# * --pipe
# * --tmux
# The ordering of the wrapping is important:
@ -7141,7 +7154,7 @@ sub wrapped {
# $Global::shell
# $opt::cat
# $opt::fifo
# @Global::cat_partials
# @Global::cat_prepends
# $opt::pipe
# $opt::tmux
# Returns:
@ -7201,15 +7214,21 @@ sub wrapped {
}
# Wrap with ssh + tranferring of files
$command = $self->sshlogin_wrap($command);
if(@Global::cat_partials) {
# Prepend:
if(@Global::cat_prepends) {
# --pipepart: prepend:
# < /tmp/foo perl -e 'while(@ARGV) {
# sysseek(STDIN,shift,0) || die; $left = shift;
# while($read = sysread(STDIN,$buf, ($left > 131072 ? 131072 : $left))){
# $left -= $read; syswrite(STDOUT,$buf);
# }
# }' 0 0 0 11 |
$command = (shift @Global::cat_partials). " | ($command)";
#
# --pipepart --tee: prepend:
# cat dash-a-file |
#
# --pipe --tee: prepend:
# cat fifo |
$command = (shift @Global::cat_prepends). " | ($command)";
} elsif($opt::pipe) {
# Wrap with EOF-detector to avoid starting $command if EOF.
$command = empty_input_wrapper($command);

View file

@ -1772,7 +1772,7 @@ Here are a few examples:
Is the job sequence even or odd?
--rpl '{odd} $_ = seq() % 2 ? "odd" : "even"'
Pad job sequence with leading zeros to get equal width
--rpl '{0#} $f = "%0".int(1+log(total_jobs())/log(10))."d"; $_=sprintf($f,seq())'
--rpl '{0#} $f=1+int("".(log(total_jobs())/log(10))); $_=sprintf("%0${f}d",seq())'
Job sequence counting from 0
--rpl '{#0} $_ = seq() - 1'
Job slot counting from 2
@ -2170,10 +2170,16 @@ B<--tagstring> is ignored when using B<-u>, B<--onall>, and B<--nonall>.
=item B<--tee> (alpha testing)
Pipe all data to all jobs. Used with B<--pipe> and B<:::>.
Pipe all data to all jobs. Used with B<--pipe>/B<--pipepart> and
B<:::>.
seq 1000 | parallel --pipe --tee -v wc {} ::: -w -l -c
How many numbers in 1..1000 contain 0..9, and how many bytes do they
fill:
seq 1000 | parallel --pipe --tee --tag 'grep {1} | wc {2}' ::: {0..9} ::: -l -c
=item B<--termseq> I<sequence>

View file

@ -675,6 +675,7 @@ par_retries_replacement_string() {
par_tee() {
export PARALLEL='-k --tee --pipe --tag'
seq 1000000 | parallel 'echo {%};LANG=C wc' ::: {1..5} ::: {a..b}
seq 300000 | parallel 'grep {1} | LANG=C wc {2}' ::: {1..5} ::: -l -c
}
par_tagstring_pipe() {

View file

@ -239,6 +239,31 @@ par_tmux_fg() {
stdout parallel --tmux --fg sleep ::: 3 | perl -pe 's/.tmp\S+/tmp/'
}
par_pipe_tee() {
echo 'bug #45479: --pipe/--pipepart --tee'
echo '--pipe --tee'
random1G() {
< /dev/zero openssl enc -aes-128-ctr -K 1234 -iv 1234 2>/dev/null |
head -c 1G;
}
random1G | parallel --pipe --tee cat ::: {1..3} | LANG=C wc -c
}
par_pipepart_tee() {
echo 'bug #45479: --pipe/--pipepart --tee'
echo '--pipepart --tee'
random1G() {
< /dev/zero openssl enc -aes-128-ctr -K 1234 -iv 1234 2>/dev/null |
head -c 1G;
}
tmp=$(mktemp)
random1G >$tmp
parallel --pipepart --tee -a $tmp cat ::: {1..3} | LANG=C wc -c
rm $tmp
}
export -f $(compgen -A function | grep par_)
compgen -A function | grep par_ | sort |
parallel --joblog /tmp/jl-`basename $0` -j10 --tag -k '{} 2>&1'

View file

@ -74,6 +74,8 @@ perl -ne '$/="\n\n"; /^Output/../^[^O]\S/ and next; /^ / and print;' ../../src/
# Timings are often off
s/^(\d)$/9/;
s/^(\d\d)$/99/;
# Sometime these vars are not present
s/^PAM_KWALLET5*_LOGIN$//;
# Fails often due to race
s/cat: input_file: No such file or directory\n//;
s{rsync: link_stat ".*/home/parallel/input_file.out" .*\n}{};

View file

@ -1654,3 +1654,13 @@ par_tee 5 a 9
par_tee 5 a 1000000 1000000 6888896
par_tee 5 b 10
par_tee 5 b 1000000 1000000 6888896
par_tee 1 -l 181902
par_tee 1 -c 1228633
par_tee 2 -l 181902
par_tee 2 -c 1228633
par_tee 3 -l 122854
par_tee 3 -c 815297
par_tee 4 -l 122853
par_tee 4 -c 815290
par_tee 5 -l 122853
par_tee 5 -c 815290

View file

@ -492,8 +492,14 @@ par_memleak should give 1 == true
par_memleak Memory use should not depend very much on the total number of jobs run\n
par_memleak Test if memory consumption(300 jobs) < memory consumption(30 jobs) * 110%
par_memleak 1
par_pipe_tee bug #45479: --pipe/--pipepart --tee
par_pipe_tee --pipe --tee
par_pipe_tee 3221225472
par_pipepart_spawn ### bug #46214: Using --pipepart doesnt spawn multiple jobs in version 20150922
par_pipepart_spawn 1:local / 8 / 999
par_pipepart_tee bug #45479: --pipe/--pipepart --tee
par_pipepart_tee --pipepart --tee
par_pipepart_tee 3221225472
par_print_before_halt_on_error ### What is printed before the jobs are killed
par_print_before_halt_on_error -2 exit code 0
par_print_before_halt_on_error -2 0 0

View file

@ -852,8 +852,6 @@ MFLAGS
MOZ_NO_REMOTE
ORACLE_HOME
ORACLE_SID
PAM_KWALLET5_LOGIN
PAM_KWALLET_LOGIN
PARALLEL
PARALLEL_PID
PARALLEL_SEQ