From f63347b070705da453d2ec1ae076c71a56c30167 Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Thu, 27 Jan 2011 22:05:34 +0100 Subject: [PATCH] --recend and --recstart are now fixed strings instead of regexp for performance reasons --- src/parallel | 69 ++++++++++++++----- testsuite/tests-to-run/test51.sh | 65 ++++++++++++++++-- testsuite/wanted-results/test51 | 112 ++++++++++++++++++++++++++++--- 3 files changed, 215 insertions(+), 31 deletions(-) diff --git a/src/parallel b/src/parallel index 826ea8d0..00079818 100755 --- a/src/parallel +++ b/src/parallel @@ -100,25 +100,52 @@ sub spreadstdin { $recend = $::opt_recend; $recerror = "Warning: --recend unmatched. Is --blocksize too small?"; } - # If $recstart/$recend contains '|' this should only apply to the regexp - $recstart = "(?:".$recstart.")"; - $recend = "(?:".$recend.")"; while(read(STDIN,substr($buf,length $buf,0),$::opt_blocksize)) { # substr above = append to $buf - if($Global::max_number_of_args) { - # -N => (start..*?end){n} - while($buf =~ s/((?:$recstart.*?$recend){$Global::max_number_of_args})($recstart.*)$/$2/os) { - $record = $1; - ::debug("Read record -N: ".length($record)."\n"); - write_record_to_pipe(\$record,$recstart,$recend); + if($::opt_regexp) { + # If $recstart/$recend contains '|' this should only apply to the regexp + $recstart = "(?:".$recstart.")"; + $recend = "(?:".$recend.")"; + if($Global::max_number_of_args) { + # -N => (start..*?end){n} + while($buf =~ s/((?:$recstart.*?$recend){$Global::max_number_of_args})($recstart.*)$/$2/os) { + $record = $1; + ::debug("Read record -N: ".length($record)."\n"); + write_record_to_pipe(\$record,$recstart,$recend); + } + } else { + # Find the last recend-recstart in $buf + if($buf =~ s/(.*$recend)($recstart.*?)$/$2/os) { + $record = $1; + ::debug("Matched record: ".length($record)."/".length($buf)."\n"); + write_record_to_pipe(\$record,$recstart,$recend); + } } } else { - # Find the last recend-recstart in $buf - if($buf =~ s/(.*$recend)($recstart.*?)$/$2/os) { - $record = $1; - ::debug("Matched record: ".length($record)."/".length($buf)."\n"); - write_record_to_pipe(\$record,$recstart,$recend); + # $recstart/$recend = printf strings (\n) + $recstart =~ s/\\([rnt'"\\])/"qq|\\$1|"/gee; + $recend =~ s/\\([rnt'"\\])/"qq|\\$1|"/gee; + if($Global::max_number_of_args) { + # -N => (start..*?end){n} + my $i = 0; + while(($i = nindex(\$buf,$recend.$recstart,$Global::max_number_of_args)) != -1) { + $i += length $recend; # find the actual splitting location + my $record = substr($buf,0,$i); + substr($buf,0,$i) = ""; + ::debug("Read record: ".length($record)."\n"); + write_record_to_pipe(\$record,$recstart,$recend); + } + } else { + # Find the last recend-recstart in $buf + my $i = rindex($buf,$recend.$recstart); + if($i != -1) { + $i += length $recend; # find the actual splitting location + my $record = substr($buf,0,$i); + substr($buf,0,$i) = ""; + ::debug("Read record: ".length($record)."\n"); + write_record_to_pipe(\$record,$recstart,$recend); + } } } } @@ -131,6 +158,18 @@ sub spreadstdin { $Global::start_no_new_jobs = 1; } +sub nindex { + my $buf_ref = shift; + my $str = shift; + my $n = shift; + my $i = 0; + for(1..$n) { + $i = index($$buf_ref,$str,$i+1); + if($i == -1) { last } + } + return $i; +} + sub flush_and_close_pipes { my $flush_done; my $sleep = 0.1; @@ -3978,6 +4017,6 @@ sub unlock { # Keep perl -w happy -$::opt_x = $::opt_workdir = $Semaphore::timeout = $Semaphore::wait = +$::opt_regexp = $::opt_x = $::opt_workdir = $Semaphore::timeout = $Semaphore::wait = $::opt_skip_first_line = $::opt_shebang = 0 ; diff --git a/testsuite/tests-to-run/test51.sh b/testsuite/tests-to-run/test51.sh index 22b59db9..4b19ab09 100644 --- a/testsuite/tests-to-run/test51.sh +++ b/testsuite/tests-to-run/test51.sh @@ -5,12 +5,64 @@ echo '### Test --pipe' seq 1 1000000 >/tmp/parallel-seq shuf --random-source=/tmp/parallel-seq /tmp/parallel-seq >/tmp/blocktest +echo '### Test -N with multiple jobslots and multiple args' +seq 1 1 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo a;sleep 0.1' +seq 1 2 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo bb;sleep 0.1' +seq 1 3 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo ccc;sleep 0.1' +seq 1 4 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo dddd;sleep 0.1' +seq 1 5 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo eeeee;sleep 0.1' +seq 1 6 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo ffffff;sleep 0.1' +seq 1 7 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo ggggggg;sleep 0.1' +seq 1 8 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo hhhhhhhh;sleep 0.1' +seq 1 9 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo iiiiiiiii;sleep 0.1' +seq 1 10 | ../src/parallel -j2 -k -N 3 --pipe 'cat;echo jjjjjjjjjj;sleep 0.1' + +echo '### Test output is the same for different block size' +echo -n 01a02a0a0a12a34a45a6a | + parallel -k -j1 --blocksize 100 --pipe --recend a -N 3 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' +echo -n 01a02a0a0a12a34a45a6a | + parallel -k -j1 --blocksize 1 --pipe --recend a -N 3 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' + +# What is this? +#cat /tmp/blocktest <(echo 'a') /tmp/blocktest <(echo 'a') /tmp/blocktest <(echo 'a') /tmp/blocktest <(echo 'a') /tmp/blocktest | +# parallel -k -j1 --pipe --recend a -N 3 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' | md5sum + +echo '### Test 100M records with too big block' +( + echo start + seq 1 1 | parallel -uj1 cat /tmp/blocktest\;true + echo end + echo start + seq 1 1 | parallel -uj1 cat /tmp/blocktest\;true + echo end + echo start + seq 1 1 | parallel -uj1 cat /tmp/blocktest\;true + echo end +) | stdout parallel -k --block 10M -j2 --pipe --recstart 'start\n' wc -c + + +echo '### Test 300M records with too small block' +( + echo start + seq 1 44 | parallel -uj1 cat /tmp/blocktest\;true + echo end + echo start + seq 1 44 | parallel -uj1 cat /tmp/blocktest\;true + echo end + echo start + seq 1 44 | parallel -uj1 cat /tmp/blocktest\;true + echo end +) | stdout parallel -k --block 200M -j2 --pipe --recend 'end\n' wc -c + + + echo '### Test --rrs -N1 --recend single' echo 12a34a45a6 | parallel -k --pipe --recend a -N1 --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' -echo '### Test --rrs -N1 --recend alternate' -echo 12a34b45a6 | - parallel -k --pipe --recend 'a|b' -N1 --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' +# Broken +#echo '### Test --rrs -N1 --recend alternate' +#echo 12a34b45a6 | +# parallel -k --pipe --recend 'a|b' -N1 --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' echo '### Test --rrs -N1 --recend single' echo 12a34b45a6 | parallel -k --pipe --recend 'b' -N1 --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' @@ -18,9 +70,10 @@ echo 12a34b45a6 | echo '### Test --rrs --recend single' echo 12a34a45a6 | parallel -k --pipe --recend a --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' -echo '### Test --rrs -N1 --recend alternate' -echo 12a34b45a6 | - parallel -k --pipe --recend 'a|b' --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' +# Broken +#echo '### Test --rrs -N1 --recend alternate' +#echo 12a34b45a6 | +# parallel -k --pipe --recend 'a|b' --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' echo '### Test --rrs -N1 --recend single' echo 12a34b45a6 | parallel -k --pipe --recend 'b' --rrs 'echo -n "$PARALLEL_SEQ>"; cat; echo; sleep 0.1' diff --git a/testsuite/wanted-results/test51 b/testsuite/wanted-results/test51 index 7b4bcc7d..a4db584b 100644 --- a/testsuite/wanted-results/test51 +++ b/testsuite/wanted-results/test51 @@ -1,16 +1,112 @@ ### Test --pipe +### Test -N with multiple jobslots and multiple args +1 +a +a +1 +2 +bb +bb +1 +2 +3 +ccc +ccc +1 +2 +3 +dddd +4 +dddd +1 +2 +3 +eeeee +4 +5 +eeeee +1 +2 +3 +ffffff +4 +5 +6 +ffffff +1 +2 +3 +ggggggg +4 +5 +6 +ggggggg +7 +ggggggg +ggggggg +1 +2 +3 +hhhhhhhh +4 +5 +6 +hhhhhhhh +7 +8 +hhhhhhhh +hhhhhhhh +1 +2 +3 +iiiiiiiii +4 +5 +6 +iiiiiiiii +7 +8 +9 +iiiiiiiii +iiiiiiiii +1 +2 +3 +jjjjjjjjjj +4 +5 +6 +jjjjjjjjjj +7 +8 +9 +jjjjjjjjjj +10 +jjjjjjjjjj +### Test output is the same for different block size +1>01a02a0a +2>0a12a34a +3>45a6a +1>01a02a0a +2>0a12a34a +3>45a6a +### Test 100M records with too big block +6888906 +6888906 +6888906 +0 +### Test 300M records with too small block +303111434 +303111434 +303111434 +0 +0 ### Test --rrs -N1 --recend single 1>12 2>34 3>45 4>6 -### Test --rrs -N1 --recend alternate -1>12 -2>34 -3>45 -4>6 - ### Test --rrs -N1 --recend single 1>12a34 2>45a6 @@ -19,10 +115,6 @@ 1>123445 2>6 -### Test --rrs -N1 --recend alternate -1>123445 -2>6 - ### Test --rrs -N1 --recend single 1>12a34 2>45a6