mirror of https://git.savannah.gnu.org/git/parallel.git
parallel: If --block is left out, --pipepart will use a block size that will result in 10 jobs per jobslot.
This commit is contained in:
parent d570ec2d20
commit 51f212e548

src/parallel: 42 lines changed
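In numbers (a sketch with made-up sizes, not output from this commit): for 1 GB of input and 4 jobslots, the new default aims for 4*10 = 40 parts of roughly 25 MB each:

  # Hypothetical: 1 GB of input (-a file), 4 jobslots (-j4).
  # New default: blocksize = 1 + int(size / jobslots / 10)
  perl -e 'print 1+int(1e9/4/10), "\n"'
  # 25000001

With --roundrobin the divisor is only the number of jobslots, i.e. one block per jobslot.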
@@ -120,17 +120,30 @@ $Global::JobQueue = JobQueue->new(
     $number_of_args,\@Global::transfer_files,\@Global::ret_files);
 
 if($opt::pipepart) {
-    if($opt::roundrobin) {
-        # Compute size of -a
+    if(not $opt::blocksize or $opt::roundrobin) {
+        # --block not set =>
+        #   compute reasonable value giving 10 jobs per jobslot
+        # --roundrobin => divide equally between jobslots
         my $size = 0;
+        # Compute size of -a
         $size += -s $_ for @opt::a;
         # Compute $Global::max_jobs_running
+        $Global::dummy_jobs = 1;
         for my $sshlogin (values %Global::host) {
             $sshlogin->max_jobs_running();
         }
-        $Global::max_jobs_running or ::die_bug("Global::max_jobs_running not set");
-        # Set --blocksize = size / no of proc
-        $opt::blocksize = 1 + $size / $Global::max_jobs_running;
+        $Global::max_jobs_running or
+            ::die_bug("Global::max_jobs_running not set");
+        if($opt::roundrobin) {
+            # Run in total $job_slots jobs
+            # Set --blocksize = size / no of proc
+            $Global::blocksize = 1 + int($size / $Global::max_jobs_running);
+        } else {
+            # Run in total $job_slots*10 jobs
+            # Set --blocksize = size / no of proc / 10
+            $Global::blocksize = 1 +
+                int($size / $Global::max_jobs_running / 10);
+        }
     }
     @Global::cat_partials = map { pipe_part_files($_) } @opt::a;
     # Unget the empty arg as many times as there are parts
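A quick way to observe the new default (this mirrors the testsuite addition further down; the temp file name is illustrative):

  seq 1000 > /tmp/blk$$              # a small input file
  # 2 jobslots, no --block: expect 2*10 = 20 parts
  parallel -j2 -k --pipepart echo {#} :::: /tmp/blk$$ | tail -n1
  # 20
  rm /tmp/blk$$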
@@ -215,7 +228,7 @@ sub pipe_part_files {
     }
     my $header = find_header(\$buf,open_or_exit($file));
     # find positions
-    my @pos = find_split_positions($file,$opt::blocksize,length $header);
+    my @pos = find_split_positions($file,$Global::blocksize,length $header);
     # Make @cat_partials
     my @cat_partials = ();
     for(my $i=0; $i<$#pos; $i++) {
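find_split_positions (unchanged here apart from the variable) walks the file in steps of the block size before aligning to record boundaries; a toy sketch of just the stepping, under that assumption (names are illustrative):

  # Print candidate split offsets of FILE every BLOCK bytes.
  FILE=bigfile BLOCK=25000001 \
  perl -e '$s = -s $ENV{FILE};
           for($p = 0; $p < $s; $p += $ENV{BLOCK}) { print "$p\n" }'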
@@ -230,7 +243,7 @@ sub find_header {
     # $fh = filehandle to read from
     # Uses:
     #   $opt::header
-    #   $opt::blocksize
+    #   $Global::blocksize
     # Returns:
     #   $header string
     my ($buf_ref, $fh) = @_;
@@ -239,7 +252,7 @@ sub find_header {
     if($opt::header eq ":") { $opt::header = "(.*\n)"; }
     # Number = number of lines
     $opt::header =~ s/^(\d+)$/"(.*\n)"x$1/e;
-    while(read($fh,substr($$buf_ref,length $$buf_ref,0),$opt::blocksize)) {
+    while(read($fh,substr($$buf_ref,length $$buf_ref,0),$Global::blocksize)) {
        if($$buf_ref=~s/^($opt::header)//) {
            $header = $1;
            last;
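The substitution in this hunk turns a numeric --header N into a regexp matching N lines; just that rewrite in isolation:

  # What a numeric --header becomes: "2" -> (.*\n)(.*\n),
  # i.e. a regexp that strips the first 2 lines as the header.
  printf 'h1\nh2\nrest\n' | perl -e '
      $h = "2"; $h =~ s/^(\d+)$/"(.*\n)" x $1/e;
      undef $/; $_ = <STDIN>;
      s/^($h)// and print "header: $1"'
  # header: h1
  # h2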
@@ -318,7 +331,7 @@ sub spreadstdin {
    # read a record
    # Spawn a job and print the record to it.
    # Uses:
-   #   $opt::blocksize
+   #   $Global::blocksize
    #   STDIN
    #   $opt::r
    #   $Global::max_lines
@@ -335,7 +348,7 @@ sub spreadstdin {
    my $chunk_number = 1;
    my $one_time_through;
    my $two_gb = 2**31-1;
-   my $blocksize = $opt::blocksize;
+   my $blocksize = $Global::blocksize;
    my $in = *STDIN;
    my $header = find_header(\$buf,$in);
    while(1) {
@@ -998,11 +1011,10 @@ sub parse_options {
     if(@opt::transfer_files) { push @Global::transfer_files, @opt::transfer_files; }
     if(not defined $opt::recstart and
        not defined $opt::recend) { $opt::recend = "\n"; }
-    if(not defined $opt::blocksize) { $opt::blocksize = "1M"; }
-    $opt::blocksize = multiply_binary_prefix($opt::blocksize);
-    if($opt::blocksize > 2**31-1) {
+    $Global::blocksize = multiply_binary_prefix($opt::blocksize || "1M");
+    if($Global::blocksize > 2**31-1) {
        warning("--blocksize >= 2G causes problems. Using 2G-1.");
-       $opt::blocksize = 2**31-1;
+       $Global::blocksize = 2**31-1;
     }
     $opt::memfree = multiply_binary_prefix($opt::memfree);
     check_invalid_option_combinations();
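Per the code above, any --block of 2 GB or more is clamped to 2^31-1 after binary-prefix expansion, so a call like the following should warn (message text taken from the warning() above; the exact prefix on the line depends on parallel's warning format):

  parallel --pipe --block 3G wc
  # parallel: Warning: --blocksize >= 2G causes problems. Using 2G-1.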
@@ -5130,7 +5142,7 @@ sub compute_number_of_processes {
        $max_system_proc_reached and last;
 
        my $before_getting_arg = time;
-       if(!$opt::roundrobin) {
+       if(!$Global::dummy_jobs) {
            get_args_or_jobs() or last;
        }
        $wait_time_for_getting_args += time - $before_getting_arg;
@@ -1133,13 +1133,19 @@ B<--files> is often used with B<--pipe>.
 B<--pipe> maxes out at around 1 GB/s input, and 100 MB/s output. If
 performance is important use B<--pipepart>.
 
-See also: B<--recstart>, B<--recend>, B<--fifo>, B<--cat>, B<--pipepart>.
+See also: B<--recstart>, B<--recend>, B<--fifo>, B<--cat>,
+B<--pipepart>, B<--files>.
 
 
 =item B<--pipepart>
 
 Pipe parts of a physical file. B<--pipepart> works similar to
-B<--pipe>, but is much faster. It has a few limitations:
+B<--pipe>, but is much faster.
+
+If B<--block> is left out, B<--pipepart> will use a block size that
+will result in 10 jobs per jobslot.
+
+B<--pipepart> has a few limitations:
 
 =over 3
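With that documented default, the minimal --pipepart invocation needs no --block at all (file name illustrative):

  # Block size is computed from the file size: ~10 jobs per jobslot.
  parallel --pipepart -a bigfile wc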
@@ -2866,7 +2872,7 @@ Or if the regexps are fixed strings:
   grep -F -f regexps.txt bigfile
 
 There are 2 limiting factors: CPU and disk I/O. CPU is easy to
-measure: If the grep takes >90% CPU (e.g. when running top), then the
+measure: If the B<grep> takes >90% CPU (e.g. when running top), then the
 CPU is a limiting factor, and parallelization will speed this up. If
 not, then disk I/O is the limiting factor, and depending on the disk
 system it may be faster or slower to parallelize. The only way to know
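One way to apply the test this paragraph describes (a sketch; file names as in the example above):

  # If user+sys is close to real, grep is CPU-bound and will benefit
  # from parallelization; if real is much larger, the disk is the
  # bottleneck.
  time grep -f regexps.txt bigfile > /dev/null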
@@ -2876,23 +2882,23 @@ If the CPU is the limiting factor parallelization should be done on the regexps:
 
   cat regexp.txt | parallel --pipe -L1000 --round-robin grep -f - bigfile
 
-If a line matches multiple regexps, the line may be duplicated. The command
-will start one grep per CPU and read bigfile one time per CPU,
-but as that is done in parallel, all reads except the first will be
-cached in RAM. Depending on the size of regexp.txt it may be faster to
-use --block 10m instead of -L1000. If regexp.txt is too big to fit in
-RAM, remove --round-robin and adjust -L1000. This will cause bigfile
-to be read more times.
+If a line matches multiple regexps, the line may be duplicated. The
+command will start one B<grep> per CPU and read I<bigfile> one time
+per CPU, but as that is done in parallel, all reads except the first
+will be cached in RAM. Depending on the size of I<regexp.txt> it may
+be faster to use B<--block 10m> instead of B<-L1000>. If I<regexp.txt>
+is too big to fit in RAM, remove B<--round-robin> and adjust
+B<-L1000>. This will cause I<bigfile> to be read more times.
 
 Some storage systems perform better when reading multiple chunks in
 parallel. This is true for some RAID systems and for some network file
-systems. To parallelize the reading of bigfile:
+systems. To parallelize the reading of I<bigfile>:
 
   parallel --pipepart --block 100M -a bigfile -k grep -f regexp.txt
 
-This will split bigfile into 100MB chunks and run grep on each of
-these chunks. To parallelize both reading of bigfile and regexp.txt
-combine the two using --fifo:
+This will split I<bigfile> into 100MB chunks and run B<grep> on each of
+these chunks. To parallelize both reading of I<bigfile> and I<regexp.txt>
+combine the two using B<--fifo>:
 
   parallel --pipepart --block 100M -a bigfile --fifo cat regexp.txt \
   \| parallel --pipe -L1000 --round-robin grep -f - {}
@@ -4860,8 +4866,9 @@ it also uses rsync with ssh.
 
 =head1 SEE ALSO
 
-B<ssh>(1), B<rsync>(1), B<find>(1), B<xargs>(1), B<dirname>(1),
-B<make>(1), B<pexec>(1), B<ppss>(1), B<xjobs>(1), B<prll>(1),
-B<dxargs>(1), B<mdm>(1)
+B<ssh>(1), B<ssh-agent>(1), B<sshpass>(1), B<ssh-copy-id>(1),
+B<rsync>(1), B<find>(1), B<xargs>(1), B<dirname>(1), B<make>(1),
+B<pexec>(1), B<ppss>(1), B<xjobs>(1), B<prll>(1), B<dxargs>(1),
+B<mdm>(1)
 
 =cut
@@ -1819,7 +1819,7 @@
 
 <p>When using <b>--cat</b>, <b>--pipepart</b>, or when a job is run on a remote machine, the command is wrapped with helper scripts. <b>-vv</b> shows all of this.</p>
 
-<pre><code>  parallel -vv --pipepart wc :::: num30000</code></pre>
+<pre><code>  parallel -vv --pipepart --block 1M wc :::: num30000</code></pre>
 
 <p>Output:</p>
 
@@ -1816,7 +1816,7 @@ When using B<--cat>, B<--pipepart>, or when a job is run on a remote
 machine, the command is wrapped with helper scripts. B<-vv> shows all
 of this.
 
-  parallel -vv --pipepart wc :::: num30000
+  parallel -vv --pipepart --block 1M wc :::: num30000
 
 Output:
 
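Both examples now pin --block 1M, presumably because the default block size now depends on input size and jobslot count: a fixed block keeps the documented output, including the byte offsets in the wrapper shown near the end of this diff, reproducible across machines.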
@@ -596,6 +596,16 @@ echo '### bug #34422: parallel -X --eta crashes with div by zero'
 # We do not care how long it took
 seq 2 | stdout parallel -X --eta echo | grep -E -v 'ETA:.*AVG'
 
+echo '**'
+
+echo '### --pipepart autoset --block => 10*jobslots'
+
+seq 1000 > /run/shm/parallel$$;
+parallel -j2 -k --pipepart echo {#} :::: /run/shm/parallel$$;
+rm /run/shm/parallel$$
+
+echo '**'
+
 
 EOF
 echo '### 1 .par file from --files expected'
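With -j2 and the new default block size, the 1000-line file is split into 2 jobslots * 10 = 20 parts, so {#} counts 1 through 20. The expected-output update in the next hunk records exactly that.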
@@ -1638,5 +1638,32 @@ Computers / CPU cores / Max jobs to run
 1:local / 8 / 2
 
 Computer:jobs running/jobs completed/%of started jobs/Average seconds to complete
+echo '**'
+**
+echo '### --pipepart autoset --block => 10*jobslots'
+### --pipepart autoset --block => 10*jobslots
+seq 1000 > /run/shm/parallel$$; parallel -j2 -k --pipepart echo {#} :::: /run/shm/parallel$$; rm /run/shm/parallel$$
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+echo '**'
+**
 ### 1 .par file from --files expected
 0
@@ -846,7 +846,7 @@ _
 parallel --env _ -S $SERVER1 'echo $VAR; my_func2' ::: bar
 
 /bin/bash: my_func2: command not found
-parallel -vv --pipepart wc :::: num30000
+parallel -vv --pipepart --block 1M wc :::: num30000
 <num30000 perl -e 'while(@ARGV) { sysseek(STDIN,shift,0) || die; $left = shift; while($read = sysread(STDIN,$buf, ($left > 131072 ? 131072 : $left))){ $left -= $read; syswrite(STDOUT,$buf); } }' 0 0 0 168894 | (wc)
  30000 30000 168894
 my_func3() {
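For readability, the --pipepart wrapper quoted in that last expected-output hunk, reformatted with comments (same code, whitespace and comments only):

  # One (offset, length) pair per block is passed in @ARGV.
  while(@ARGV) {
      sysseek(STDIN, shift, 0) || die;   # jump to the block's start
      $left = shift;                     # bytes remaining in this block
      while($read = sysread(STDIN, $buf,
                            ($left > 131072 ? 131072 : $left))) {
          $left -= $read;                # stream in <=128 KB chunks
          syswrite(STDOUT, $buf);
      }
  }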