--recend and --recstart are now fixed strings instead of regexps, for performance reasons

This commit is contained in:
Ole Tange 2011-01-27 22:05:34 +01:00
parent b0afcf9fa4
commit f63347b070
3 changed files with 215 additions and 31 deletions

View file

@ -100,12 +100,13 @@ sub spreadstdin {
$recend = $::opt_recend;
$recerror = "Warning: --recend unmatched. Is --blocksize too small?";
}
# If $recstart/$recend contains '|' this should only apply to the regexp
$recstart = "(?:".$recstart.")";
$recend = "(?:".$recend.")";
while(read(STDIN,substr($buf,length $buf,0),$::opt_blocksize)) {
# substr above = append to $buf
if($::opt_regexp) {
# If $recstart/$recend contains '|' this should only apply to the regexp
$recstart = "(?:".$recstart.")";
$recend = "(?:".$recend.")";
if($Global::max_number_of_args) {
# -N => (start..*?end){n}
while($buf =~ s/((?:$recstart.*?$recend){$Global::max_number_of_args})($recstart.*)$/$2/os) {
@ -121,6 +122,32 @@ sub spreadstdin {
write_record_to_pipe(\$record,$recstart,$recend);
}
}
} else {
# $recstart/$recend = printf strings (\n)
$recstart =~ s/\\([rnt'"\\])/"qq|\\$1|"/gee;
$recend =~ s/\\([rnt'"\\])/"qq|\\$1|"/gee;
if($Global::max_number_of_args) {
# -N => (start..*?end){n}
my $i = 0;
while(($i = nindex(\$buf,$recend.$recstart,$Global::max_number_of_args)) != -1) {
$i += length $recend; # find the actual splitting location
my $record = substr($buf,0,$i);
substr($buf,0,$i) = "";
::debug("Read record: ".length($record)."\n");
write_record_to_pipe(\$record,$recstart,$recend);
}
} else {
# Find the last recend-recstart in $buf
my $i = rindex($buf,$recend.$recstart);
if($i != -1) {
$i += length $recend; # find the actual splitting location
my $record = substr($buf,0,$i);
substr($buf,0,$i) = "";
::debug("Read record: ".length($record)."\n");
write_record_to_pipe(\$record,$recstart,$recend);
}
}
}
}
# If there is anything left in the buffer write it
write_record_to_pipe(\$buf,$recstart,$recend);
@ -131,6 +158,18 @@ sub spreadstdin {
$Global::start_no_new_jobs = 1;
}
sub nindex {
    # Find the position of the N'th occurrence of a fixed string in a buffer.
    # Input:
    #   $buf_ref = reference to the string to search
    #   $str     = fixed string to look for (not a regexp)
    #   $n       = which occurrence to locate
    # Returns:
    #   offset of the $n'th occurrence, or -1 if $str occurs fewer
    #   than $n times. If $n < 1 no search is done and 0 is returned.
    # Note:
    #   The first search starts at offset 1, so an occurrence at offset 0
    #   is never counted. Each following search starts one character
    #   after the previous hit, so overlapping occurrences are counted.
    my ($buf_ref, $str, $n) = @_;
    my $pos = 0;
    foreach my $round (1 .. $n) {
        $pos = index(${$buf_ref}, $str, $pos + 1);
        # Ran out of occurrences before reaching the $n'th one
        return -1 if $pos == -1;
    }
    return $pos;
}
sub flush_and_close_pipes {
my $flush_done;
my $sleep = 0.1;
@ -3978,6 +4017,6 @@ sub unlock {
# Keep perl -w happy
$::opt_x = $::opt_workdir = $Semaphore::timeout = $Semaphore::wait =
$::opt_regexp = $::opt_x = $::opt_workdir = $Semaphore::timeout = $Semaphore::wait =
$::opt_skip_first_line = $::opt_shebang = 0 ;

View file

@ -5,12 +5,64 @@ echo '### Test --pipe'
seq 1 1000000 >/tmp/parallel-seq
shuf --random-source=/tmp/parallel-seq /tmp/parallel-seq >/tmp/blocktest
echo '### Test -N with multiple jobslots and multiple args'
seq 1 1 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo a;sleep 0.1'
seq 1 2 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo bb;sleep 0.1'
seq 1 3 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo ccc;sleep 0.1'
seq 1 4 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo dddd;sleep 0.1'
seq 1 5 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo eeeee;sleep 0.1'
seq 1 6 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo ffffff;sleep 0.1'
seq 1 7 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo ggggggg;sleep 0.1'
seq 1 8 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo hhhhhhhh;sleep 0.1'
seq 1 9 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo iiiiiiiii;sleep 0.1'
seq 1 10 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo jjjjjjjjjj;sleep 0.1'
echo '### Test output is the same for different block size'
echo -n 01a02a0a0a12a34a45a6a |
parallel -k -j1 --blocksize 100 --pipe --recend a -N 3 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'
echo -n 01a02a0a0a12a34a45a6a |
parallel -k -j1 --blocksize 1 --pipe --recend a -N 3 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'
# What is this?
#cat /tmp/blocktest <(echo 'a') /tmp/blocktest <(echo 'a') /tmp/blocktest <(echo 'a') /tmp/blocktest <(echo 'a') /tmp/blocktest |
# parallel -k -j1 --pipe --recend a -N 3 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' | md5sum
echo '### Test 100M records with too big block'
(
echo start
seq 1 1 | parallel -uj1 cat /tmp/blocktest\;true
echo end
echo start
seq 1 1 | parallel -uj1 cat /tmp/blocktest\;true
echo end
echo start
seq 1 1 | parallel -uj1 cat /tmp/blocktest\;true
echo end
) | stdout parallel -k --block 10M -j2 --pipe --recstart 'start\n' wc -c
echo '### Test 300M records with too small block'
(
echo start
seq 1 44 | parallel -uj1 cat /tmp/blocktest\;true
echo end
echo start
seq 1 44 | parallel -uj1 cat /tmp/blocktest\;true
echo end
echo start
seq 1 44 | parallel -uj1 cat /tmp/blocktest\;true
echo end
) | stdout parallel -k --block 200M -j2 --pipe --recend 'end\n' wc -c
echo '### Test --rrs -N1 --recend single'
echo 12a34a45a6 |
parallel -k --pipe --recend a -N1 --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'
echo '### Test --rrs -N1 --recend alternate'
echo 12a34b45a6 |
parallel -k --pipe --recend 'a|b' -N1 --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'
# Broken
#echo '### Test --rrs -N1 --recend alternate'
#echo 12a34b45a6 |
# parallel -k --pipe --recend 'a|b' -N1 --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'
echo '### Test --rrs -N1 --recend single'
echo 12a34b45a6 |
parallel -k --pipe --recend 'b' -N1 --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'
@ -18,9 +70,10 @@ echo 12a34b45a6 |
echo '### Test --rrs --recend single'
echo 12a34a45a6 |
parallel -k --pipe --recend a --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'
echo '### Test --rrs -N1 --recend alternate'
echo 12a34b45a6 |
parallel -k --pipe --recend 'a|b' --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'
# Broken
#echo '### Test --rrs -N1 --recend alternate'
#echo 12a34b45a6 |
# parallel -k --pipe --recend 'a|b' --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'
echo '### Test --rrs -N1 --recend single'
echo 12a34b45a6 |
parallel -k --pipe --recend 'b' --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1'

View file

@ -1,16 +1,112 @@
### Test --pipe
### Test -N with multiple jobslots and multiple args
1
a
a
1
2
bb
bb
1
2
3
ccc
ccc
1
2
3
dddd
4
dddd
1
2
3
eeeee
4
5
eeeee
1
2
3
ffffff
4
5
6
ffffff
1
2
3
ggggggg
4
5
6
ggggggg
7
ggggggg
ggggggg
1
2
3
hhhhhhhh
4
5
6
hhhhhhhh
7
8
hhhhhhhh
hhhhhhhh
1
2
3
iiiiiiiii
4
5
6
iiiiiiiii
7
8
9
iiiiiiiii
iiiiiiiii
1
2
3
jjjjjjjjjj
4
5
6
jjjjjjjjjj
7
8
9
jjjjjjjjjj
10
jjjjjjjjjj
### Test output is the same for different block size
1>01a02a0a
2>0a12a34a
3>45a6a
1>01a02a0a
2>0a12a34a
3>45a6a
### Test 100M records with too big block
6888906
6888906
6888906
0
### Test 300M records with too small block
303111434
303111434
303111434
0
0
### Test --rrs -N1 --recend single
1>12
2>34
3>45
4>6
### Test --rrs -N1 --recend alternate
1>12
2>34
3>45
4>6
### Test --rrs -N1 --recend single
1>12a34
2>45a6
@ -19,10 +115,6 @@
1>123445
2>6
### Test --rrs -N1 --recend alternate
1>123445
2>6
### Test --rrs -N1 --recend single
1>12a34
2>45a6