parallel: move job failure and print_earlier into Job object.

Ole Tange 2014-12-29 01:16:32 +01:00
parent fe08937ea9
commit 0992d1a7c9
4 changed files with 181 additions and 108 deletions

View file

@@ -32,8 +32,8 @@ measure() {
     CORES=$3
     VERSION=$4
-    # Force cpuspeed at 1.7GHz
-    forever 'parallel sudo cpufreq-set -f 1700MHz -c{} ::: {0..7}' &
+    # Force cpuspeed at 1.7GHz - seems to give tighter results
+    forever 'sleep 10;parallel sudo cpufreq-set -f 1700MHz -c{} ::: {0..7}' &
     PATH=/tmp/bin:$PATH
     cd /tmp/bin
@@ -53,4 +53,4 @@ _
     evince /tmp/boxplot.pdf
 }
-measure 300 1000 8 1
+measure 3000 1000 8 1
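
measure() relies on a forever helper that keeps re-running its argument in the background (forever '...' &). The helper itself is not part of this diff, so this Perl sketch of its presumable shape is an assumption:

    #!/usr/bin/perl
    # Hypothetical stand-in for the 'forever' helper used by measure():
    # rerun the given shell command until interrupted.
    use strict;
    use warnings;

    my $cmd = join(" ", @ARGV) or die "Usage: forever command\n";
    system($cmd) while 1;   # e.g. 'sleep 10;parallel sudo cpufreq-set ...'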

View file

@@ -226,9 +226,9 @@ cc:Tim Cuthbertson <tim3d.junk@gmail.com>,
 Ryoichiro Suzuki <ryoichiro.suzuki@gmail.com>,
 Jesse Alama <jesse.alama@gmail.com>

-Subject: GNU Parallel 20141222 ('Manila') released
+Subject: GNU Parallel 20150122 ('Manila SQ8501') released

-GNU Parallel 20141222 ('Manila') has been released. It is available for download at: http://ftp.gnu.org/gnu/parallel/
+GNU Parallel 20150122 ('Manila SQ8501') has been released. It is available for download at: http://ftp.gnu.org/gnu/parallel/

 Haiku of the month:

View file

@@ -158,6 +158,7 @@ if($Global::semaphore) {
     $sem->release();
 }
 for(keys %Global::sshmaster) {
+    # If 'ssh -M's are running: kill them
     kill "TERM", $_;
 }
 ::debug("init", "Halt\n");
@@ -1186,7 +1187,8 @@ sub parse_env_var {
     if(not @qcsh) { push @qcsh, "true"; }
     if(not @qbash) { push @qbash, "true"; }
     # Create lines like:
-    # echo $SHELL | grep "/t\\{0,1\\}csh" >/dev/null && setenv V1 val1 && setenv V2 val2 || export V1=val1 && export V2=val2 ; echo "$V1$V2"
+    # echo $SHELL | grep "/t\\{0,1\\}csh" >/dev/null && setenv V1 val1 &&
+    # setenv V2 val2 || export V1=val1 && export V2=val2 ; echo "$V1$V2"
     if(@vars) {
         $Global::envvar .=
             join"",
@@ -1641,6 +1643,9 @@ sub __RUNNING_THE_JOBS_AND_PRINTING_PROGRESS__ {}
 # $Global::total_started = total jobs started
 sub init_run_jobs {
+    # Set Global variables and progress signal handlers
+    # Do the copying of basefiles
+    # Returns: N/A
     $Global::total_running = 0;
     $Global::total_started = 0;
     $Global::tty_taken = 0;
@@ -1654,6 +1659,13 @@ sub init_run_jobs {
     my %last_mtime;
     sub changed_procs_file {
+        # If --jobs is a file and it is modified:
+        # Force recomputing of max_jobs_running for each $sshlogin
+        # Uses:
+        # $Global::max_procs_file
+        # $Global::max_procs_file_last_mod
+        # %Global::host
+        # Returns: N/A
         if($Global::max_procs_file) {
             # --jobs filename
             my $mtime = (stat($Global::max_procs_file))[9];
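
The new header comment describes a cached-mtime check; changed_sshloginfile below applies the same test to every file in @opt::sshloginfile. The pattern in isolation (a sketch; the names are illustrative):

    use strict;
    use warnings;

    my %last_mtime;    # filename => mtime seen on the previous check
    sub file_changed {
        # True on first sight of $file or when its mtime has moved
        my $file = shift;
        my $mtime = (stat($file))[9] // 0;   # missing file counts as 0
        return 0 if defined $last_mtime{$file} and $last_mtime{$file} == $mtime;
        $last_mtime{$file} = $mtime;
        return 1;
    }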
@@ -1666,7 +1678,18 @@ sub init_run_jobs {
             }
         }
     }
     sub changed_sshloginfile {
+        # If --slf is changed:
+        # reload --slf
+        # filter_hosts
+        # setup_basefile
+        # Uses:
+        # @opt::sshloginfile
+        # @Global::sshlogin
+        # %Global::host
+        # $opt::filter_hosts
+        # Returns: N/A
         if(@opt::sshloginfile) {
             # Is --sshloginfile changed?
             for my $slf (@opt::sshloginfile) {
@@ -1702,12 +1725,8 @@ sub init_run_jobs {
     # * not server swapping
     # * not too short time since last remote login
     # Uses:
-    # $Global::max_procs_file
-    # $Global::max_procs_file_last_mod
     # %Global::host
-    # @opt::sshloginfile
     # $Global::start_no_new_jobs
-    # $opt::filter_hosts
     # $Global::JobQueue
     # $opt::pipe
     # $opt::load
@@ -2481,49 +2500,50 @@ sub filter_hosts {
     # %Global::host
     # $Global::minimal_command_line_length
     # $opt::use_cpus_instead_of_cores
-    # Returns:
-    # N/A
-    my (%ncores, %ncpus, %time_to_login, %maxlen, %echo, @down_hosts);
+    # Returns: N/A
     my ($ncores_ref, $ncpus_ref, $time_to_login_ref, $maxlen_ref,
         $echo_ref, $down_hosts_ref) =
         parse_host_filtering(parallelized_host_filtering());
-    %ncores = %$ncores_ref;
-    %ncpus = %$ncpus_ref;
-    %time_to_login = %$time_to_login_ref;
-    %maxlen = %$maxlen_ref;
-    %echo = %$echo_ref;
-    @down_hosts = @$down_hosts_ref;
-    delete @Global::host{@down_hosts};
-    @down_hosts and ::warning("Removed @down_hosts\n");
+    delete @Global::host{@$down_hosts_ref};
+    @$down_hosts_ref and ::warning("Removed @$down_hosts_ref\n");
     $Global::minimal_command_line_length = 8_000_000;
     while (my ($sshlogin, $obj) = each %Global::host) {
         if($sshlogin eq ":") { next }
-        $ncpus{$sshlogin} or ::die_bug("ncpus missing: ".$obj->serverlogin());
-        $ncores{$sshlogin} or ::die_bug("ncores missing: ".$obj->serverlogin());
-        $time_to_login{$sshlogin} or ::die_bug("time_to_login missing: ".$obj->serverlogin());
-        $maxlen{$sshlogin} or ::die_bug("maxlen missing: ".$obj->serverlogin());
+        $ncpus_ref->{$sshlogin} or ::die_bug("ncpus missing: ".$obj->serverlogin());
+        $ncores_ref->{$sshlogin} or ::die_bug("ncores missing: ".$obj->serverlogin());
+        $time_to_login_ref->{$sshlogin} or ::die_bug("time_to_login missing: ".$obj->serverlogin());
+        $maxlen_ref->{$sshlogin} or ::die_bug("maxlen missing: ".$obj->serverlogin());
         if($opt::use_cpus_instead_of_cores) {
-            $obj->set_ncpus($ncpus{$sshlogin});
+            $obj->set_ncpus($ncpus_ref->{$sshlogin});
         } else {
-            $obj->set_ncpus($ncores{$sshlogin});
+            $obj->set_ncpus($ncores_ref->{$sshlogin});
         }
-        $obj->set_time_to_login($time_to_login{$sshlogin});
-        $obj->set_maxlength($maxlen{$sshlogin});
+        $obj->set_time_to_login($time_to_login_ref->{$sshlogin});
+        $obj->set_maxlength($maxlen_ref->{$sshlogin});
         $Global::minimal_command_line_length =
             ::min($Global::minimal_command_line_length,
-                  int($maxlen{$sshlogin}/2));
-        ::debug("init", "Timing from -S:$sshlogin ncpus:",$ncpus{$sshlogin},
-                " ncores:", $ncores{$sshlogin},
-                " time_to_login:", $time_to_login{$sshlogin},
-                " maxlen:", $maxlen{$sshlogin},
+                  int($maxlen_ref->{$sshlogin}/2));
+        ::debug("init", "Timing from -S:$sshlogin ncpus:",$ncpus_ref->{$sshlogin},
+                " ncores:", $ncores_ref->{$sshlogin},
+                " time_to_login:", $time_to_login_ref->{$sshlogin},
+                " maxlen:", $maxlen_ref->{$sshlogin},
                 " min_max_len:", $Global::minimal_command_line_length,"\n");
     }
 }

 sub parse_host_filtering {
+    # Input:
+    # @lines = output from parallelized_host_filtering()
+    # Returns:
+    # \%ncores = number of cores of {host}
+    # \%ncpus = number of cpus of {host}
+    # \%time_to_login = time_to_login on {host}
+    # \%maxlen = max command len on {host}
+    # \%echo = echo received from {host}
+    # \@down_hosts = list of hosts with no answer
     my (%ncores, %ncpus, %time_to_login, %maxlen, %echo, @down_hosts);
     for (@_) {
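
The mechanical change in this hunk: instead of copying each hashref returned by parse_host_filtering into a plain hash (%ncpus = %$ncpus_ref) and indexing the copy, the new code indexes through the references with the arrow operator, skipping the copies. The difference in isolation:

    use strict;
    use warnings;

    sub stats { return ({ cores => 8 }, { cpus => 2 }) }   # returns hashrefs

    my ($ncores_ref, $ncpus_ref) = stats();

    # Old style: dereference into a full copy, then index the copy.
    my %ncores = %$ncores_ref;          # copies every key/value pair
    print $ncores{cores}, "\n";         # 8

    # New style: keep the reference and index through it.
    print $ncpus_ref->{cpus}, "\n";     # 2, no copy made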
@@ -2651,7 +2671,46 @@ sub parallelized_host_filtering {
 }

 sub onall {
+    # Runs @command on all hosts.
+    # Uses parallel to run @command on each host.
+    # --jobs = number of hosts to run on simultaneously.
+    # For each host a parallel command with the args will be running.
+    # Uses:
+    # $Global::quoting
+    # @opt::basefile
+    # $opt::jobs
+    # $opt::linebuffer
+    # $opt::ungroup
+    # $opt::group
+    # $opt::keeporder
+    # $opt::D
+    # $opt::plain
+    # $opt::max_chars
+    # $opt::linebuffer
+    # $opt::files
+    # $opt::colsep
+    # $opt::timeout
+    # $opt::plain
+    # $opt::retries
+    # $opt::max_chars
+    # $opt::arg_sep
+    # $opt::arg_file_sep
+    # @opt::v
+    # @opt::env
+    # %Global::host
+    # $Global::exitstatus
+    # $Global::debug
+    # $Global::joblog
+    # $opt::tag
+    # $opt::joblog
+    # Input:
+    # @command = command to run on all hosts
+    # Returns: N/A
     sub tmp_joblog {
+        # Input:
+        # $joblog = filename of joblog - undef if none
+        # Returns:
+        # $tmpfile = temp file for joblog - undef if none
         my $joblog = shift;
         if(not defined $joblog) {
             return undef;
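
The new comments summarize onall's strategy: save the argument sources to temp files, then run one inner parallel per host. A heavily simplified sketch of that shape (a hypothetical reduction; the real code threads the listed options through and limits concurrent hosts with an outer parallel honoring --jobs, rather than a plain loop):

    use strict;
    use warnings;

    my @hosts   = qw(server1.example.com server2.example.com);
    my @command = ('echo', 'arg {} on host');

    # Stand-in for ::tmpfile(): save the real args so every host sees them.
    my $argfile = "/tmp/args.all";
    open(my $fh, '>', $argfile) or die "open: $!";
    print $fh "$_\n" for 1..3;
    close $fh;

    for my $host (@hosts) {
        system('parallel', '-S', $host, '-a', $argfile, @command);
    }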
@@ -2665,7 +2724,7 @@ sub onall {
         @command = shell_quote_empty(@command);
     }
-    # Copy all @fhlist into tempfiles
+    # Copy all @fhlist (-a and :::) into tempfiles
     my @argfiles = ();
     for my $fh (@fhlist) {
         my ($outfh, $name) = ::tmpfile(SUFFIX => ".all", UNLINK => 1);
@@ -2765,6 +2824,7 @@ sub save_original_signal_handler {
 }

 sub list_running_jobs {
+    # Print running jobs on tty
     # Returns: N/A
     for my $v (values %Global::running) {
         ::status("$Global::progname: ",$v->replaced(),"\n");
@@ -2772,6 +2832,7 @@ sub list_running_jobs {
 }

 sub start_no_new_jobs {
+    # Start no more jobs
     # Returns: N/A
     $SIG{TERM} = $Global::original_sig{TERM};
     ::status
@@ -2786,11 +2847,22 @@ sub reaper {
     # A job finished.
     # Print the output.
     # Start another job
+    # Uses:
+    # %Global::sshmaster
+    # %Global::running
+    # $Global::tty_taken
+    # @Global::slots
+    # $opt::timeout
+    # $Global::timeoutq
+    # $opt::halt_on_error
+    # $opt::keeporder
+    # $Global::total_running
     # Returns: N/A
     my $stiff;
     my $children_reaped = 0;
     debug("run", "Reaper ");
     while (($stiff = waitpid(-1, &WNOHANG)) > 0) {
+        # $stiff = pid of dead process
         $children_reaped++;
         if($Global::sshmaster{$stiff}) {
             # This is one of the ssh -M: ignore
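
reaper's core is the standard non-blocking wait loop. A minimal sketch of that loop on its own:

    use strict;
    use warnings;
    use POSIX ":sys_wait_h";    # for WNOHANG

    sub reap_children {
        # Collect every child that has already exited, without blocking
        # on the ones still running. Returns the number reaped.
        my $reaped = 0;
        while((my $pid = waitpid(-1, WNOHANG)) > 0) {
            my $exit = $? >> 8;    # the dead child's exit status
            $reaped++;
            # ...here parallel looks $pid up in %Global::running,
            # prints the finished job and starts a replacement...
        }
        return $reaped;            # 0 if no child was ready
    }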
@@ -2820,12 +2892,12 @@ sub reaper {
         my $print_now = ($opt::halt_on_error and $opt::halt_on_error == 2
                          and $job->exitstatus());
         if($opt::keeporder and not $print_now) {
-            print_earlier_jobs($job);
+            $job->print_earlier_jobs();
         } else {
             $job->print();
         }
         if($job->exitstatus()) {
-            process_failed_job($job);
+            $job->fail();
         }
     }
@@ -2840,61 +2912,6 @@ sub reaper {
     return $children_reaped;
 }

-sub process_failed_job {
-    # The job had an exit status <> 0, so error
-    # Returns: N/A
-    my $job = shift;
-    $Global::exitstatus++;
-    $Global::total_failed++;
-    if($opt::halt_on_error) {
-        if($opt::halt_on_error == 1
-           or
-           ($opt::halt_on_error < 1 and $Global::total_failed > 3
-            and
-            $Global::total_failed / $Global::total_started > $opt::halt_on_error)) {
-            # If halt on error == 1 or --halt 10%
-            # we should gracefully exit
-            ::status
-                ("$Global::progname: Starting no more jobs. ",
-                 "Waiting for ", scalar(keys %Global::running),
-                 " jobs to finish. This job failed:\n",
-                 $job->replaced(),"\n");
-            $Global::start_no_new_jobs ||= 1;
-            $Global::halt_on_error_exitstatus = $job->exitstatus();
-        } elsif($opt::halt_on_error == 2) {
-            # If halt on error == 2 we should exit immediately
-            ::status
-                ("$Global::progname: This job failed:\n",
-                 $job->replaced(),"\n");
-            exit ($job->exitstatus());
-        }
-    }
-}
-
-{
-    my (%print_later,$job_end_sequence);
-
-    sub print_earlier_jobs {
-        # Print jobs completed earlier
-        # Returns: N/A
-        my $job = shift;
-        $print_later{$job->seq()} = $job;
-        $job_end_sequence ||= 1;
-        debug("run", "Looking for: $job_end_sequence ",
-              "Current: ", $job->seq(), "\n");
-        for(my $j = $print_later{$job_end_sequence};
-            $j or vec($Global::job_already_run,$job_end_sequence,1);
-            $job_end_sequence++,
-            $j = $print_later{$job_end_sequence}) {
-            debug("run", "Found job end $job_end_sequence");
-            if($j) {
-                $j->print();
-                delete $print_later{$job_end_sequence};
-            }
-        }
-    }
-}
-
 sub __USAGE__ {}

 sub wait_and_exit {
@@ -6367,6 +6384,30 @@ sub should_be_retried {
     }
 }

+{
+    my (%print_later,$job_end_sequence);
+
+    sub print_earlier_jobs {
+        # Print jobs completed earlier
+        # Returns: N/A
+        my $job = shift;
+        $print_later{$job->seq()} = $job;
+        $job_end_sequence ||= 1;
+        ::debug("run", "Looking for: $job_end_sequence ",
+                "Current: ", $job->seq(), "\n");
+        for(my $j = $print_later{$job_end_sequence};
+            $j or vec($Global::job_already_run,$job_end_sequence,1);
+            $job_end_sequence++,
+            $j = $print_later{$job_end_sequence}) {
+            ::debug("run", "Found job end $job_end_sequence");
+            if($j) {
+                $j->print();
+                delete $print_later{$job_end_sequence};
+            }
+        }
+    }
+}
+
 sub print {
     # Print the output of the jobs
     # Returns: N/A
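
The print_earlier_jobs block moved above implements --keep-order buffering: jobs that finish out of order park in %print_later until the lowest unprinted sequence number catches up (the vec() test additionally skips sequence numbers recorded as already run, as with --resume). The idea in isolation:

    use strict;
    use warnings;

    my %print_later;    # seq => buffered output
    my $next_seq = 1;   # lowest sequence not yet printed

    sub job_finished {
        my ($seq, $output) = @_;
        $print_later{$seq} = $output;
        # Flush the contiguous run starting at $next_seq, if any.
        while(exists $print_later{$next_seq}) {
            print delete $print_later{$next_seq};
            $next_seq++;
        }
    }

    # Jobs finishing in the order 2, 3, 1 still print as 1, 2, 3:
    job_finished(2, "job 2\n");
    job_finished(3, "job 3\n");
    job_finished(1, "job 1\n");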
@@ -6480,8 +6521,8 @@ sub linebuffer_print {
     # 327680 --tag = 4.4s
     # 1024000 --tag = 4.4s
     # 3276800 --tag = 4.3s
-    # 32768000 --tag = 4.7s
     # 10240000 --tag = 4.3s
+    # 32768000 --tag = 4.7s
     while(read($in_fh,substr($$partial,length $$partial),3276800)) {
         # Append to $$partial
         # Find the last \n
@@ -6685,6 +6726,36 @@ sub set_exitsignal {
     }
 }

+sub fail {
+    # The job had an exit status <> 0, so error
+    # Returns: N/A
+    my $job = shift;
+    $Global::exitstatus++;
+    $Global::total_failed++;
+    if($opt::halt_on_error) {
+        if($opt::halt_on_error == 1
+           or
+           ($opt::halt_on_error < 1 and $Global::total_failed > 3
+            and
+            $Global::total_failed / $Global::total_started > $opt::halt_on_error)) {
+            # If halt on error == 1 or --halt 10%
+            # we should gracefully exit
+            ::status
+                ("$Global::progname: Starting no more jobs. ",
+                 "Waiting for ", scalar(keys %Global::running),
+                 " jobs to finish. This job failed:\n",
+                 $job->replaced(),"\n");
+            $Global::start_no_new_jobs ||= 1;
+            $Global::halt_on_error_exitstatus = $job->exitstatus();
+        } elsif($opt::halt_on_error == 2) {
+            # If halt on error == 2 we should exit immediately
+            ::status
+                ("$Global::progname: This job failed:\n",
+                 $job->replaced(),"\n");
+            exit ($job->exitstatus());
+        }
+    }
+}
+
 package CommandLine;
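
fail(), moved above into the Job package, encodes the three --halt-on-error regimes: 2 exits immediately, 1 stops starting new jobs, and a fraction such as --halt 10% (0.1) stops once more than 3 jobs have failed and the failure ratio passes the threshold. The bare decision, extracted as a sketch (halt_action is an illustrative helper, not parallel's API):

    sub halt_action {
        my ($halt, $failed, $started) = @_;
        return "none"      unless $halt;
        return "immediate" if $halt == 2;   # exit with the job's status
        return "graceful"  if $halt == 1;   # let running jobs finish
        if($halt < 1 and $failed > 3 and $failed / $started > $halt) {
            return "graceful";              # e.g. --halt 10% => 0.1
        }
        return "none";
    }

    print halt_action(0.1, 4, 10), "\n";    # "graceful": 4/10 > 0.1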
@@ -6730,7 +6801,7 @@ sub seq {
 sub slot {
     # Find the number of a free job slot and return it
     # Uses:
-    # @Global::slots
+    # @Global::slots - list with free jobslots
     # Returns:
     # $jobslot = number of jobslot
     my $self = shift;
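
slot() hands each job a slot number, drawing from @Global::slots (the free list this hunk documents) before minting new numbers. A free-list sketch of the idea (the lowest-first reuse policy shown here is illustrative):

    use strict;
    use warnings;

    my @free_slots;      # stand-in for @Global::slots
    my $max_slot = 0;

    sub take_slot {
        # Reuse a released slot if one exists, else allocate the next number.
        return @free_slots ? shift @free_slots : ++$max_slot;
    }

    sub release_slot {
        # Keep the free list sorted so the lowest slot is handed out first.
        my $slot = shift;
        @free_slots = sort { $a <=> $b } (@free_slots, $slot);
    }

    my $a = take_slot();     # 1
    my $b = take_slot();     # 2
    release_slot($a);
    my $c = take_slot();     # 1 again, reused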

View file

@@ -983,12 +983,14 @@ servers.
 Run all the jobs on all computers given with B<--sshlogin>. GNU
 B<parallel> will log into B<--jobs> number of computers in parallel
 and run one job at a time on the computer. The order of the jobs will
-not be changed, but some computers may finish before others. B<-j>
-adjusts how many computers to log into in parallel.
+not be changed, but some computers may finish before others.

 When using B<--group> the output will be grouped by each server, so
 all the output from one server will be grouped together.

+B<--joblog> will contain an entry for each job on each server, so
+there will be several job sequence 1.
+
 =item B<--output-as-files>