From 34e0b4e1360c7e9bb9d73cbc1684e2e5953ef9bf Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Sun, 27 Sep 2020 16:24:02 +0200 Subject: [PATCH] 2grep: --header implemented. Passes tests. --- 2search/2grep | 254 +++++++++++++++++++++++++++++-------- 2search/2search | 254 +++++++++++++++++++++++++++++-------- 2search/regressiontest | 74 ++++++++++- 2search/regressiontest.out | 195 +++++++++++++++++----------- README | 8 +- 5 files changed, 593 insertions(+), 192 deletions(-) diff --git a/2search/2grep b/2search/2grep index ff5ca56..3d29a24 100755 --- a/2search/2grep +++ b/2search/2grep @@ -8,17 +8,17 @@ =head1 SYNOPSIS -B<2search> [-nrfB] file string [string...] +B<2search> [-nrfHB] file string [string...] -B<2search> --grep [-nrf] file string [string...] +B<2search> --grep [-nrfH] file string [string...] -B<2grep> [-nrf] file string [string...] +B<2grep> [-nrfH] file string [string...] -... | B<2search> [-nrfB] file +... | B<2search> [-nrfHB] file -... | B<2search> --grep [-nrf] file +... | B<2search> --grep [-nrfH] file -... | B<2grep> [-nrf] file +... | B<2grep> [-nrfH] file =head1 DESCRIPTION @@ -52,12 +52,11 @@ print byte position where string would have been consider only blanks and alphanumeric characters -=item B<--debug> (not implemented) +=item B<--debug> =item B<-D> -annotate the part of the line used to sort, and warn about -questionable usage to stderr +annotate the part of the line used to sort to stderr =item B<--ignore-case> @@ -81,6 +80,13 @@ search for all lines in I compare according to general numerical value +=item B<--header> + +=item B<-H> + +treat the first line in I as a header + + =item B<--ignore-nonprinting> (not implemented) =item B<-i> @@ -114,18 +120,20 @@ sort via a key; KEYDEF gives location and type =item B<-n> compare according to string numerical value. If numerical values are -the same: split the string into blocks of numbers and non-numbers, and -compare numbers as numbers and strings as strings. 
- -This will sort like this: chr3 chr11 3chr 11chr +the same: compare as strings. =item B<--numascii> =item B<-N> -compare according to string numerical value. If numerical values are -the same: compare as strings +split the string into blocks of numbers and non-numbers. For each +block compare the block as numbers, if the numerical values are the +same: compare the block as strings. + +This will sort like this: 3chr 11chr chr3 chr11 + +This is similar to B<--version-sort>, but without the exceptions. =item B<--random-sort> @@ -152,7 +160,7 @@ B<-M>, numeric B<-n>, random B<-R>, version B<-V> =item B<--field-separator=SEP> -use SEP instead of non-blank to blank transition +use I<SEP> instead of blanks (\s+). I<SEP> is a regexp. =item B<-z> @@ -161,39 +169,101 @@ use SEP instead of non-blank to blank transition end lines with 0 byte, not newline + =back =head1 EXAMPLES =head2 Single key -Input is sorted by Chromosome,Position: +Given sorted I<input.txt> like: - SampleID Position Chromosome - foo 10000123 chr3 - foo 10000125 chr3 - foo 9999998 chr11 - foo 10000124 chr11 - foo 10000126 chr11 + A_number B_number Date Duration CellID + 12893827 21034191 2020-03-21T13:38:13 P00:00:20 CPH382 + 12893827 80012345 2020-03-20T12:34:23 P00:00:20 CPH382 + 12893827 80012345 2020-03-20T12:45:03 P00:05:20 CPH382 + 22355591 47827750 2020-03-20T11:28:33 P00:32:27 ALB923 + 22355591 81382631 2020-03-21T21:28:33 P00:12:48 CPH382 + 22356142 45701514 2020-03-20T22:41:23 P00:02:48 CPH022 + 22356142 56818446 2020-03-21T08:38:34 P00:31:24 CPH645 -To find all chr3: +To get all records with 22355591 you can run: - 2grep -n -k3 inputfile chr3 + grep ^22355591 input.txt --n will split 'chr3' into 'chr' which is compared asciibetically and +But if I<input.txt> is several TB big, it can be very slow. 
B<2grep> +uses binary search which only works if the file is sorted, but takes +less than 1 second to run: + + 2grep -H input.txt 22355591 + +You can also search for a shorter string to get all records starting +with 2235: + + 2grep -H input.txt 2235 + +Or you can search for multiple search strings: + + 2grep -H input.txt 12893827 22356142 + +=head2 Multiple keys + +Input is sorted by SampleID, Chromosome, Position (in that order): + + SampleID Chromosome Position Data + PatientA chr3 10002123 CCGTCTAATGGCTTGATTGGTACACCATGACATTGA + PatientA chr3 10003125 TCCATCGTCGGCGAGAAGGTACCAGGTAA + PatientA chr11 9999998 AATTCACAGTATGGCTGACGGTGTCGTAGCTACACG + PatientA chr11 10001240 TCCAGAAGTTTGA + PatientA chr11 10001260 ATAACGAGAACTTACGTTTTAAAAGGCCTA + PatientB chr3 10000125 GTCTTCACTTTATAAATGGATGATAGCCTTCA + +SampleID is sorted as text. Chromosome is sorted by text first and +numerically for the number. Position is sorted by number. + +To find all chr3 for PatientA: + + 2grep -H -k1,2N inputfile PatientA chr3 + +-N will split 'chr3' into 'chr' which is compared asciibetically and '3' which is compared numerically. -=head2 Not implemented +To find all chr3 for PatientA and all chr3 for PatientB: -To find all lines with chr3,10000125: + 2grep -H -k1,2N inputfile PatientA chr3 PatientB chr3 - 2grep -k3n,2n inputfile chr3 10000125 +=head1 PERFORMANCE + +Binary search requires seeks from the disk. But B<2search> is designed +so that multiple searches will reuse cached data. This means searches +will be faster the more you run. + +You can improve the speed even more by sorting the input strings. This +will make it possible to reuse cached data more. + +It can be even faster if you run multiple searches in parallel. + +This is due to magnetic drives' elevator sorting of requests when +seeking and due to NVMe drives working faster with more queues in +parallel. 
+ + cat searchstrings | parallel -n50 -j10 2grep inputfile + + +=head1 BUGS + +B<2search> does not respect your locale setting. It assumes the input +is sorted with LC_ALL=C. If it is not B<2search> may give the wrong +result. + +To solve this sort the input with B. =head1 REPORTING BUGS -B<2search> is part of tangetools. Report bugs to . +B<2search> is part of tangetools. Report bugs on +https://gitlab.com/ole.tange/tangetools/-/issues =head1 AUTHOR @@ -342,20 +412,32 @@ GetOptions( "sort=s" => \$opt::sort, "V|version-sort" => \$opt::version_sort, "k|key=s" => \@opt::key, + "H|header" => \$opt::header, "t|field-separator=s" => \$opt::field_separator, + "recend|record-end=s" => \$opt::record_end, + "recstart|record-start=s" => \$opt::record_start, "z|zero-terminated" => \$opt::zero_terminated, - ); + ) || exit(255); $Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1]; $Global::version = 20200328; if($opt::version) { version(); exit 0; } if($opt::zero_terminated) { $/ = "\0"; } -if(@opt::key) { +if(@opt::key) { # Default separator if --key = whitespace - $Global::sep = '\s+'; - if(defined $opt::field_separator) { $Global::sep = $opt::field_separator; } + $Global::fieldsep = '\s+'; + if(defined $opt::field_separator) { $Global::fieldsep = $opt::field_separator; } } if($Global::progname eq "2grep") { $opt::grep = 1; } $Global::debug = $opt::D; +if(defined $opt::record_end or defined $opt::record_start) { + if(not defined $opt::record_end) { $opt::record_end = ""; } + if(not defined $opt::record_start) { $opt::record_start = ""; } + $/ = unquote_printf($opt::record_end).unquote_printf($opt::record_start); +} else { + # Default = \n + $opt::record_end = "\n"; + $/ = $opt::record_end; +} parse_keydef(); @@ -370,6 +452,19 @@ if(@ARGV) { $opt::stdin = 1; } +$Global::headersize = 0; +if($opt::header) { + if(not open (my $fh, "<", $file)) { + error("Cannot open '$file'"); + exit 1; + } else { + my $header = <$fh>; + $header =~ s/\Q$opt::record_start\E$//; + $Global::headersize = 
length $header; + print $header; + } +} + round: while(1) { my @search_vals; @@ -385,7 +480,7 @@ if(@ARGV) { } else { print bsearch($file,@search_vals); } -} +} { my $fh; @@ -447,7 +542,7 @@ sub bgrep { sub bsearch { my $file = shift; my @search_vals = @_; - my $min = 0; + my $min = $Global::headersize; my $max = -s $file; my $fh; if(not open ($fh, "<", $file)) { @@ -474,7 +569,7 @@ sub bsearch { compare(($line = <$fh>),@search_vals) >= 0) { # We have see this newline position before # or we are at the end of the file - # or we should search the upper half + # or we should search the lower half $max = $middle; $maxnl = $newline_pos; } else { @@ -485,19 +580,43 @@ sub bsearch { } seek($fh,$minnl,0) or die("Cannot seek to $minnl"); $line = <$fh>; + my $len = length $opt::record_start; + my $retpos; if(compare($line,@search_vals) >= 0) { - if($opt::byte_offset) { - return $minnl."\n"; - } else { - return $line; - } + # Adjust for length of $recstart + $retpos = $minnl - $len; } else { - if($opt::byte_offset) { - return tell($fh)."\n"; + $retpos = tell($fh) - $len; + } + $retpos = $retpos < 0 ? 0 : $retpos; + if($opt::byte_offset) { + return $retpos."\n"; + } else { + seek($fh,$retpos,0) or die("Cannot seek to $minnl"); + if(length $opt::record_end) { + # read record: A...BA + # Remove $opt::record_start if it is at the end + # (might not be only record) + $line = <$fh>; + $line =~ s/\Q$opt::record_start\E$//; } else { - $line=<$fh>; - return $line; + # --recend == '' + if(length $opt::record_start) { + # read record: A...A + # Remove $opt::record_start if it is at the end + # (might not be only record) + $line = <$fh>; # Read: A + $line .= <$fh>; # Read: ...A + $line =~ s/\Q$opt::record_start\E$//; + } else { + # Len recstart == Len recend = 0. Does this ever happen? + # read record. 
+ # Remove $opt::record_start if it is there (might be only record) + $line = <$fh>; + $line =~ s/\Q$opt::record_start\E$//; + } } + return $line; } } @@ -533,11 +652,11 @@ sub parse_keydef { ); if(@opt::key) { - + # skip } else { # Convert -n -r to -k1rn # with sep = undef - $Global::sep = undef; + $Global::fieldsep = undef; my $opt; $opt->{'field'} = 1; $opt->{'char'} = 1; @@ -546,7 +665,7 @@ sub parse_keydef { } push(@Global::keydefs,$opt); } - + for my $keydefs (@opt::key) { for my $keydef (split /,/, $keydefs) { my $opt; @@ -573,11 +692,11 @@ sub compare { # One key to search for per search column my($line,@search_vals) = @_; chomp($line); - debug("Compare: $line <=> @search_vals "); + debug("Compare: $line <=> @search_vals; "); my @field; - if($Global::sep) { + if($Global::fieldsep) { # Split line - @field = split /$Global::sep/o, $line; + @field = split /$Global::fieldsep/o, $line; } else { @field = ($line); } @@ -628,9 +747,20 @@ sub compare_single { return ($m{$a} || 0) <=> ($m{$b} || 0); } if($opt->{'numeric_sort'}) { - return $a <=> $b; + return($a <=> $b or $a cmp $b); } elsif($opt->{'numascii'}) { - return $a <=> $b or $a cmp $b; + # Split on digit boundary + my @a = split /(?<=\d)(?=\D)|(?<=\D)(?=\d)/i, $a; + my @b = split /(?<=\d)(?=\D)|(?<=\D)(?=\d)/i, $b; + my $c; + for(my $t = 0; + defined $a[$t] and defined $b[$t]; + $t++) { + $c = ($a[$t] <=> $b[$t] or $a[$t] cmp $b[$t]); + $c and return $c; + } + # All parts match, maybe one is longer + return $#a <=> $#b; } else { return $a cmp $b; } @@ -775,3 +905,19 @@ sub debug(@) { $Global::debug or return; print @_; } + +sub unquote_printf() { + # Convert \t \n \r \000 \0 + # Inputs: + # $string = string with \t \n \r \num \0 + # Returns: + # $replaced = string with TAB NEWLINE CR NUL + $_ = shift; + s/\\t/\t/g; + s/\\n/\n/g; + s/\\r/\r/g; + s/\\(\d\d\d)/eval 'sprintf "\\'.$1.'"'/ge; + s/\\(\d)/eval 'sprintf "\\'.$1.'"'/ge; + return $_; +} + diff --git a/2search/2search b/2search/2search index 
ff5ca56..3d29a24 100755 --- a/2search/2search +++ b/2search/2search @@ -8,17 +8,17 @@ =head1 SYNOPSIS -B<2search> [-nrfB] file string [string...] +B<2search> [-nrfHB] file string [string...] -B<2search> --grep [-nrf] file string [string...] +B<2search> --grep [-nrfH] file string [string...] -B<2grep> [-nrf] file string [string...] +B<2grep> [-nrfH] file string [string...] -... | B<2search> [-nrfB] file +... | B<2search> [-nrfHB] file -... | B<2search> --grep [-nrf] file +... | B<2search> --grep [-nrfH] file -... | B<2grep> [-nrf] file +... | B<2grep> [-nrfH] file =head1 DESCRIPTION @@ -52,12 +52,11 @@ print byte position where string would have been consider only blanks and alphanumeric characters -=item B<--debug> (not implemented) +=item B<--debug> =item B<-D> -annotate the part of the line used to sort, and warn about -questionable usage to stderr +annotate the part of the line used to sort to stderr =item B<--ignore-case> @@ -81,6 +80,13 @@ search for all lines in I compare according to general numerical value +=item B<--header> + +=item B<-H> + +treat the first line in I<file> as a header + + =item B<--ignore-nonprinting> (not implemented) =item B<-i> @@ -114,18 +120,20 @@ sort via a key; KEYDEF gives location and type =item B<-n> compare according to string numerical value. If numerical values are -the same: split the string into blocks of numbers and non-numbers, and -compare numbers as numbers and strings as strings. - -This will sort like this: chr3 chr11 3chr 11chr +the same: compare as strings. =item B<--numascii> =item B<-N> -compare according to string numerical value. If numerical values are -the same: compare as strings +split the string into blocks of numbers and non-numbers. For each +block compare the block as numbers, if the numerical values are the +same: compare the block as strings. + +This will sort like this: 3chr 11chr chr3 chr11 + +This is similar to B<--version-sort>, but without the exceptions. 
=item B<--random-sort> @@ -152,7 +160,7 @@ B<-M>, numeric B<-n>, random B<-R>, version B<-V> =item B<--field-separator=SEP> -use SEP instead of non-blank to blank transition +use I<SEP> instead of blanks (\s+). I<SEP> is a regexp. =item B<-z> @@ -161,39 +169,101 @@ use SEP instead of non-blank to blank transition end lines with 0 byte, not newline + =back =head1 EXAMPLES =head2 Single key -Input is sorted by Chromosome,Position: +Given sorted I<input.txt> like: - SampleID Position Chromosome - foo 10000123 chr3 - foo 10000125 chr3 - foo 9999998 chr11 - foo 10000124 chr11 - foo 10000126 chr11 + A_number B_number Date Duration CellID + 12893827 21034191 2020-03-21T13:38:13 P00:00:20 CPH382 + 12893827 80012345 2020-03-20T12:34:23 P00:00:20 CPH382 + 12893827 80012345 2020-03-20T12:45:03 P00:05:20 CPH382 + 22355591 47827750 2020-03-20T11:28:33 P00:32:27 ALB923 + 22355591 81382631 2020-03-21T21:28:33 P00:12:48 CPH382 + 22356142 45701514 2020-03-20T22:41:23 P00:02:48 CPH022 + 22356142 56818446 2020-03-21T08:38:34 P00:31:24 CPH645 -To find all chr3: +To get all records with 22355591 you can run: - 2grep -n -k3 inputfile chr3 + grep ^22355591 input.txt --n will split 'chr3' into 'chr' which is compared asciibetically and +But if I<input.txt> is several TB big, it can be very slow. 
B<2grep> +uses binary search which only works if the file is sorted, but takes +less than 1 second to run: + + 2grep -H input.txt 22355591 + +You can also search for a shorter string to get all records starting +with 2235: + + 2grep -H input.txt 2235 + +Or you can search for multiple search strings: + + 2grep -H input.txt 12893827 22356142 + +=head2 Multiple keys + +Input is sorted by SampleID, Chromosome, Position (in that order): + + SampleID Chromosome Position Data + PatientA chr3 10002123 CCGTCTAATGGCTTGATTGGTACACCATGACATTGA + PatientA chr3 10003125 TCCATCGTCGGCGAGAAGGTACCAGGTAA + PatientA chr11 9999998 AATTCACAGTATGGCTGACGGTGTCGTAGCTACACG + PatientA chr11 10001240 TCCAGAAGTTTGA + PatientA chr11 10001260 ATAACGAGAACTTACGTTTTAAAAGGCCTA + PatientB chr3 10000125 GTCTTCACTTTATAAATGGATGATAGCCTTCA + +SampleID is sorted as text. Chromosome is sorted by text first and +numerically for the number. Position is sorted by number. + +To find all chr3 for PatientA: + + 2grep -H -k1,2N inputfile PatientA chr3 + +-N will split 'chr3' into 'chr' which is compared asciibetically and '3' which is compared numerically. -=head2 Not implemented +To find all chr3 for PatientA and all chr3 for PatientB: -To find all lines with chr3,10000125: + 2grep -H -k1,2N inputfile PatientA chr3 PatientB chr3 - 2grep -k3n,2n inputfile chr3 10000125 +=head1 PERFORMANCE + +Binary search requires seeks from the disk. But B<2search> is designed +so that multiple searches will reuse cached data. This means searches +will be faster the more you run. + +You can improve the speed even more by sorting the input strings. This +will make it possible to reuse cached data more. + +It can be even faster if you run multiple searches in parallel. + +This is due to magnetic drives' elevator sorting of requests when +seeking and due to NVMe drives working faster with more queues in +parallel. 
+ + cat searchstrings | parallel -n50 -j10 2grep inputfile + + +=head1 BUGS + +B<2search> does not respect your locale setting. It assumes the input +is sorted with LC_ALL=C. If it is not B<2search> may give the wrong +result. + +To solve this sort the input with B. =head1 REPORTING BUGS -B<2search> is part of tangetools. Report bugs to . +B<2search> is part of tangetools. Report bugs on +https://gitlab.com/ole.tange/tangetools/-/issues =head1 AUTHOR @@ -342,20 +412,32 @@ GetOptions( "sort=s" => \$opt::sort, "V|version-sort" => \$opt::version_sort, "k|key=s" => \@opt::key, + "H|header" => \$opt::header, "t|field-separator=s" => \$opt::field_separator, + "recend|record-end=s" => \$opt::record_end, + "recstart|record-start=s" => \$opt::record_start, "z|zero-terminated" => \$opt::zero_terminated, - ); + ) || exit(255); $Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1]; $Global::version = 20200328; if($opt::version) { version(); exit 0; } if($opt::zero_terminated) { $/ = "\0"; } -if(@opt::key) { +if(@opt::key) { # Default separator if --key = whitespace - $Global::sep = '\s+'; - if(defined $opt::field_separator) { $Global::sep = $opt::field_separator; } + $Global::fieldsep = '\s+'; + if(defined $opt::field_separator) { $Global::fieldsep = $opt::field_separator; } } if($Global::progname eq "2grep") { $opt::grep = 1; } $Global::debug = $opt::D; +if(defined $opt::record_end or defined $opt::record_start) { + if(not defined $opt::record_end) { $opt::record_end = ""; } + if(not defined $opt::record_start) { $opt::record_start = ""; } + $/ = unquote_printf($opt::record_end).unquote_printf($opt::record_start); +} else { + # Default = \n + $opt::record_end = "\n"; + $/ = $opt::record_end; +} parse_keydef(); @@ -370,6 +452,19 @@ if(@ARGV) { $opt::stdin = 1; } +$Global::headersize = 0; +if($opt::header) { + if(not open (my $fh, "<", $file)) { + error("Cannot open '$file'"); + exit 1; + } else { + my $header = <$fh>; + $header =~ s/\Q$opt::record_start\E$//; + $Global::headersize = 
length $header; + print $header; + } +} + round: while(1) { my @search_vals; @@ -385,7 +480,7 @@ if(@ARGV) { } else { print bsearch($file,@search_vals); } -} +} { my $fh; @@ -447,7 +542,7 @@ sub bgrep { sub bsearch { my $file = shift; my @search_vals = @_; - my $min = 0; + my $min = $Global::headersize; my $max = -s $file; my $fh; if(not open ($fh, "<", $file)) { @@ -474,7 +569,7 @@ sub bsearch { compare(($line = <$fh>),@search_vals) >= 0) { # We have see this newline position before # or we are at the end of the file - # or we should search the upper half + # or we should search the lower half $max = $middle; $maxnl = $newline_pos; } else { @@ -485,19 +580,43 @@ sub bsearch { } seek($fh,$minnl,0) or die("Cannot seek to $minnl"); $line = <$fh>; + my $len = length $opt::record_start; + my $retpos; if(compare($line,@search_vals) >= 0) { - if($opt::byte_offset) { - return $minnl."\n"; - } else { - return $line; - } + # Adjust for length of $recstart + $retpos = $minnl - $len; } else { - if($opt::byte_offset) { - return tell($fh)."\n"; + $retpos = tell($fh) - $len; + } + $retpos = $retpos < 0 ? 0 : $retpos; + if($opt::byte_offset) { + return $retpos."\n"; + } else { + seek($fh,$retpos,0) or die("Cannot seek to $minnl"); + if(length $opt::record_end) { + # read record: A...BA + # Remove $opt::record_start if it is at the end + # (might not be only record) + $line = <$fh>; + $line =~ s/\Q$opt::record_start\E$//; } else { - $line=<$fh>; - return $line; + # --recend == '' + if(length $opt::record_start) { + # read record: A...A + # Remove $opt::record_start if it is at the end + # (might not be only record) + $line = <$fh>; # Read: A + $line .= <$fh>; # Read: ...A + $line =~ s/\Q$opt::record_start\E$//; + } else { + # Len recstart == Len recend = 0. Does this ever happen? + # read record. 
+ # Remove $opt::record_start if it is there (might be only record) + $line = <$fh>; + $line =~ s/\Q$opt::record_start\E$//; + } } + return $line; } } @@ -533,11 +652,11 @@ sub parse_keydef { ); if(@opt::key) { - + # skip } else { # Convert -n -r to -k1rn # with sep = undef - $Global::sep = undef; + $Global::fieldsep = undef; my $opt; $opt->{'field'} = 1; $opt->{'char'} = 1; @@ -546,7 +665,7 @@ sub parse_keydef { } push(@Global::keydefs,$opt); } - + for my $keydefs (@opt::key) { for my $keydef (split /,/, $keydefs) { my $opt; @@ -573,11 +692,11 @@ sub compare { # One key to search for per search column my($line,@search_vals) = @_; chomp($line); - debug("Compare: $line <=> @search_vals "); + debug("Compare: $line <=> @search_vals; "); my @field; - if($Global::sep) { + if($Global::fieldsep) { # Split line - @field = split /$Global::sep/o, $line; + @field = split /$Global::fieldsep/o, $line; } else { @field = ($line); } @@ -628,9 +747,20 @@ sub compare_single { return ($m{$a} || 0) <=> ($m{$b} || 0); } if($opt->{'numeric_sort'}) { - return $a <=> $b; + return($a <=> $b or $a cmp $b); } elsif($opt->{'numascii'}) { - return $a <=> $b or $a cmp $b; + # Split on digit boundary + my @a = split /(?<=\d)(?=\D)|(?<=\D)(?=\d)/i, $a; + my @b = split /(?<=\d)(?=\D)|(?<=\D)(?=\d)/i, $b; + my $c; + for(my $t = 0; + defined $a[$t] and defined $b[$t]; + $t++) { + $c = ($a[$t] <=> $b[$t] or $a[$t] cmp $b[$t]); + $c and return $c; + } + # All parts match, maybe one is longer + return $#a <=> $#b; } else { return $a cmp $b; } @@ -775,3 +905,19 @@ sub debug(@) { $Global::debug or return; print @_; } + +sub unquote_printf() { + # Convert \t \n \r \000 \0 + # Inputs: + # $string = string with \t \n \r \num \0 + # Returns: + # $replaced = string with TAB NEWLINE CR NUL + $_ = shift; + s/\\t/\t/g; + s/\\n/\n/g; + s/\\r/\r/g; + s/\\(\d\d\d)/eval 'sprintf "\\'.$1.'"'/ge; + s/\\(\d)/eval 'sprintf "\\'.$1.'"'/ge; + return $_; +} + diff --git a/2search/regressiontest b/2search/regressiontest 
index f3b71a9..819d741 100755 --- a/2search/regressiontest +++ b/2search/regressiontest @@ -2,6 +2,7 @@ test_tmp=`tempfile` export test_tmp +export LC_ALL=C opt_tester() { opt="$@" @@ -111,10 +112,10 @@ test_rn_opt() { } test_r_opt() { - opt_tester -rn + opt_tester -r } -test_k32_2n_1n() { +test_k3N_2N_1n() { tmp=$(tempfile) cat >$tmp <$tmp <sp|O14683|P5I11_HUMAN Tumor protein p53-inducible protein 11 +MIHNYMEHLERTKLHQLSGSDQLESTAHSRIRKERPISLGIFPLPAGDGLLTPDAQKGGET +PGSEQWKFQELSQPRSHTSLKVSNSPEPQKAVEQEDELSDVSQGGSKATTPASTANSDVAT +IPTDTPLKEENEGFVKVTDAPNKSEISKHIEVQVAQETRNVSTGSAENEEKSEVQAIIEST +PELDMDKDLSGYKGSSTPTKGIENKAFDRNTESLFEELSSAGSGLIGDVDEGADLLGMGRE +VENLILENTQLLETKNALNIVKNDLIAKVDELTCEKDVLQGELEAVKQAKLKLEEKNRELE +EELRKARAEAEDARQKAKDDDDSDIPTAQRKRFTRVEMARVLMERNQYKERLMELQEAVRW +TEMIRASRENPAMQEKKRSSIWQFFSRLFSSSSNTTKKPEPPVNLKYNAPTSHVTPSVX +>sp|P04637|P53_HUMAN Cellular +IQVVSRCRLRHTEVLPAEEENDSLGADGT +PQLX +>sp|P10144|GRAB_HUMAN Granzyme B OS=Homo sapiens OX=9606 +MQPILLLLAFLLLPRADAGEIIGGHEAKPHSRPYMAYLMIWDQKSLKRCGGFLIRD +WGSSINVTLGAHNIKEQEPTQQFIPVKRPIPHPAYNPKNFSNDIMLLQLERKAKRT +SNKAQVKPGQTCSVAGWGQTAPLGKHSHTLQEVKMTVQEDRKCEX +>sp|P13674|P4HA1_HUMAN Prolyl +VECCPNCRGTGMQIRIHQIGPGMVQQIQS +DGQKITFHGEGDQEPGLEPGDIIIVLDQK +SHPGQIVKHGDIKCVLNEGMX +>sp|Q06416|P5F1B_HUMAN Putative POU domain, class 5, transc +IVVKGHSTCLSEGALSPDGTVLATASHDGYVKFWQIYIEGQDEPRCLHEWKP +HDGRPLSCLLFCDNHKKQDPDVPFWRFLITGADQNRELKMWCTVSWTCLQTI +RFSPDIFSSVSVPPSLKVCLDLSAEYLILSDVQRKVLYVMELLQNQEEGHAC +FSSISEFLLTHPVLSFGIQVVSRCRLRHTEVLPAEEENDSLGADGTHGAGAX +>sp|Q7Z4N8|P4HA3_HUMAN +MTEQMTLRGTLKGHNGWVTQIA +YGIPQRALRGHSHFVSDVVISS +GHTKDVLSVAFSSDNRQIVSGS +VRFSPNSSNPIIVSX +>sp|Q96A73|P33MX_HUMAN Putative +RNDDDDTSVCLGTRQCSWFAGCTNRTWNSSA +VPLIGLPNTQDYKWVDRNSGLTWSGNDTCLY +SCQNQTKGLLYQLFRNLFCSYGLTEAHGKWR +CADASITNDKGHDGHRTPTWWLTGSNLTLSV +NNSGLFFLCGNGVYKGFPPKWSGRCGLGYLV +PSLTRYLTLNASQITNLRSFIHKVTPHRX +>sp|Q9UHX1|PUF60_HUMAN Poly(U)-binding-splicing +MGKDYYQTLGLARGASDEEIKRAYRRQALRYHPDKNKEPGAEEKFKE 
+IAEAYDVLSDPRKREIFDRYGEEGLKGSGPSGGSGGGANGTSFSYTF +HGDPHAMFAEFFGGRNPFDTFFGQRNGEEGMDIDDPFSGFPMGMGGF +TNVNFGRSRSAQEPARKKQDPPVTHDLX +EOF + echo "--regstart" + echo start + 2search -t '\|' -k2 --recstart '>' $tmp O14683 + echo middle + 2search -t '\|' -k2 --recstart '>' --recend '' $tmp Q96A73 + echo end + 2search -t '\|' -k2 --recstart '>' $tmp Q9UHX1 + + echo "--regstart + --regend" + echo start + 2search -t '\|' -k2 --recstart '>' --recend '\n' $tmp O14683 + echo middle + 2search -t '\|' -k2 --recstart '>' --recend '\n' $tmp Q96A73 + echo end + 2search -t '\|' -k2 --recstart '>' --recend '\n' $tmp Q9UHX1 + + rm $tmp +} export -f $(compgen -A function | grep test_) compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1' > regressiontest.new -diff regressiontest.new regressiontest.out +diff -Naur regressiontest.new regressiontest.out diff --git a/2search/regressiontest.out b/2search/regressiontest.out index 54e1fe4..4b965c3 100644 --- a/2search/regressiontest.out +++ b/2search/regressiontest.out @@ -1,15 +1,11 @@ -test_k32_2n_1n 111 chr10 Sample 10 -test_k32_2n_1n 1111 chr10 Sample 10 -test_k32_2n_1n 11111 chr10 Sample 10 -test_k32_2n_1n 111111 chr10 Sample 10 +test_k3N_2N_1n 111 chr2 Sample 10 test_n Search in null file test_n 0 test_n 0 test_n 0 test_n 0 test_n Search in newline -test_n -test_n 0 +test_n 1 test_n 1 test_n 1 test_n 1 @@ -65,8 +61,7 @@ test_n_opt 0 test_n_opt 0 test_n_opt Search in newline test_n_opt Search in -test_n_opt -test_n_opt 0 +test_n_opt 1 test_n_opt 1 test_n_opt 1 test_n_opt 1 @@ -150,6 +145,118 @@ test_partial_line 36 test_partial_line 37 test_partial_line 38 test_partial_line 39 +test_r_opt Search in null file +test_r_opt Search in +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt Search in newline +test_r_opt Search in +test_r_opt +test_r_opt +test_r_opt +test_r_opt +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt Search in 1.000000000 +test_r_opt 1.000000000 +test_r_opt 1.000000000 
+test_r_opt 1.000000000 +test_r_opt 12 +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt Search in 2 1.000000000 +test_r_opt 2 +test_r_opt 2 +test_r_opt 1.000000000 +test_r_opt 14 +test_r_opt 0 +test_r_opt 0 +test_r_opt 2 +test_r_opt Search in 2.000000000 1 +test_r_opt 1 +test_r_opt 2.000000000 +test_r_opt 1 +test_r_opt 14 +test_r_opt 12 +test_r_opt 0 +test_r_opt 12 +test_r_opt Search in 3 2 1.000000000 +test_r_opt 2 +test_r_opt 2 +test_r_opt 1.000000000 +test_r_opt 16 +test_r_opt 2 +test_r_opt 2 +test_r_opt 4 +test_r_opt Search in 3 2.000000000 1 +test_r_opt 1 +test_r_opt 2.000000000 +test_r_opt 1 +test_r_opt 16 +test_r_opt 14 +test_r_opt 2 +test_r_opt 14 +test_r_opt Search in 3.000000000 2 1 +test_r_opt 2 +test_r_opt 2 +test_r_opt 1 +test_r_opt 16 +test_r_opt 12 +test_r_opt 12 +test_r_opt 14 +test_recstart --regstart +test_recstart start +test_recstart >sp|O14683|P5I11_HUMAN Tumor protein p53-inducible protein 11 +test_recstart MIHNYMEHLERTKLHQLSGSDQLESTAHSRIRKERPISLGIFPLPAGDGLLTPDAQKGGET +test_recstart PGSEQWKFQELSQPRSHTSLKVSNSPEPQKAVEQEDELSDVSQGGSKATTPASTANSDVAT +test_recstart IPTDTPLKEENEGFVKVTDAPNKSEISKHIEVQVAQETRNVSTGSAENEEKSEVQAIIEST +test_recstart PELDMDKDLSGYKGSSTPTKGIENKAFDRNTESLFEELSSAGSGLIGDVDEGADLLGMGRE +test_recstart VENLILENTQLLETKNALNIVKNDLIAKVDELTCEKDVLQGELEAVKQAKLKLEEKNRELE +test_recstart EELRKARAEAEDARQKAKDDDDSDIPTAQRKRFTRVEMARVLMERNQYKERLMELQEAVRW +test_recstart TEMIRASRENPAMQEKKRSSIWQFFSRLFSSSSNTTKKPEPPVNLKYNAPTSHVTPSVX +test_recstart middle +test_recstart >sp|Q96A73|P33MX_HUMAN Putative +test_recstart RNDDDDTSVCLGTRQCSWFAGCTNRTWNSSA +test_recstart VPLIGLPNTQDYKWVDRNSGLTWSGNDTCLY +test_recstart SCQNQTKGLLYQLFRNLFCSYGLTEAHGKWR +test_recstart CADASITNDKGHDGHRTPTWWLTGSNLTLSV +test_recstart NNSGLFFLCGNGVYKGFPPKWSGRCGLGYLV +test_recstart PSLTRYLTLNASQITNLRSFIHKVTPHRX +test_recstart end +test_recstart >sp|Q9UHX1|PUF60_HUMAN Poly(U)-binding-splicing +test_recstart MGKDYYQTLGLARGASDEEIKRAYRRQALRYHPDKNKEPGAEEKFKE +test_recstart 
IAEAYDVLSDPRKREIFDRYGEEGLKGSGPSGGSGGGANGTSFSYTF +test_recstart HGDPHAMFAEFFGGRNPFDTFFGQRNGEEGMDIDDPFSGFPMGMGGF +test_recstart TNVNFGRSRSAQEPARKKQDPPVTHDLX +test_recstart --regstart + --regend +test_recstart start +test_recstart >sp|O14683|P5I11_HUMAN Tumor protein p53-inducible protein 11 +test_recstart MIHNYMEHLERTKLHQLSGSDQLESTAHSRIRKERPISLGIFPLPAGDGLLTPDAQKGGET +test_recstart PGSEQWKFQELSQPRSHTSLKVSNSPEPQKAVEQEDELSDVSQGGSKATTPASTANSDVAT +test_recstart IPTDTPLKEENEGFVKVTDAPNKSEISKHIEVQVAQETRNVSTGSAENEEKSEVQAIIEST +test_recstart PELDMDKDLSGYKGSSTPTKGIENKAFDRNTESLFEELSSAGSGLIGDVDEGADLLGMGRE +test_recstart VENLILENTQLLETKNALNIVKNDLIAKVDELTCEKDVLQGELEAVKQAKLKLEEKNRELE +test_recstart EELRKARAEAEDARQKAKDDDDSDIPTAQRKRFTRVEMARVLMERNQYKERLMELQEAVRW +test_recstart TEMIRASRENPAMQEKKRSSIWQFFSRLFSSSSNTTKKPEPPVNLKYNAPTSHVTPSVX +test_recstart middle +test_recstart >sp|Q96A73|P33MX_HUMAN Putative +test_recstart RNDDDDTSVCLGTRQCSWFAGCTNRTWNSSA +test_recstart VPLIGLPNTQDYKWVDRNSGLTWSGNDTCLY +test_recstart SCQNQTKGLLYQLFRNLFCSYGLTEAHGKWR +test_recstart CADASITNDKGHDGHRTPTWWLTGSNLTLSV +test_recstart NNSGLFFLCGNGVYKGFPPKWSGRCGLGYLV +test_recstart PSLTRYLTLNASQITNLRSFIHKVTPHRX +test_recstart end +test_recstart >sp|Q9UHX1|PUF60_HUMAN Poly(U)-binding-splicing +test_recstart MGKDYYQTLGLARGASDEEIKRAYRRQALRYHPDKNKEPGAEEKFKE +test_recstart IAEAYDVLSDPRKREIFDRYGEEGLKGSGPSGGSGGGANGTSFSYTF +test_recstart HGDPHAMFAEFFGGRNPFDTFFGQRNGEEGMDIDDPFSGFPMGMGGF +test_recstart TNVNFGRSRSAQEPARKKQDPPVTHDLX test_rn_opt Search in null file test_rn_opt Search in test_rn_opt 0 @@ -183,11 +290,11 @@ test_rn_opt 0 test_rn_opt 0 test_rn_opt 0 test_rn_opt Search in 2.000000000 1 -test_rn_opt 2.000000000 +test_rn_opt 1 test_rn_opt 2.000000000 test_rn_opt 2.000000000 test_rn_opt 14 -test_rn_opt 0 +test_rn_opt 12 test_rn_opt 0 test_rn_opt 0 test_rn_opt Search in 3 2 1.000000000 @@ -199,11 +306,11 @@ test_rn_opt 2 test_rn_opt 2 test_rn_opt 0 test_rn_opt Search in 3 2.000000000 1 -test_rn_opt 2.000000000 +test_rn_opt 
1 test_rn_opt 2.000000000 test_rn_opt 3 test_rn_opt 16 -test_rn_opt 2 +test_rn_opt 14 test_rn_opt 2 test_rn_opt 0 test_rn_opt Search in 3.000000000 2 1 @@ -214,67 +321,3 @@ test_rn_opt 16 test_rn_opt 12 test_rn_opt 12 test_rn_opt 0 -test_r_opt Search in null file -test_r_opt Search in -test_r_opt 0 -test_r_opt 0 -test_r_opt 0 -test_r_opt 0 -test_r_opt Search in newline -test_r_opt Search in -test_r_opt -test_r_opt -test_r_opt -test_r_opt -test_r_opt 0 -test_r_opt 0 -test_r_opt 0 -test_r_opt 0 -test_r_opt Search in 1.000000000 -test_r_opt 1.000000000 -test_r_opt 1.000000000 -test_r_opt 1.000000000 -test_r_opt 12 -test_r_opt 0 -test_r_opt 0 -test_r_opt 0 -test_r_opt Search in 2 1.000000000 -test_r_opt 2 -test_r_opt 2 -test_r_opt 2 -test_r_opt 14 -test_r_opt 0 -test_r_opt 0 -test_r_opt 0 -test_r_opt Search in 2.000000000 1 -test_r_opt 2.000000000 -test_r_opt 2.000000000 -test_r_opt 2.000000000 -test_r_opt 14 -test_r_opt 0 -test_r_opt 0 -test_r_opt 0 -test_r_opt Search in 3 2 1.000000000 -test_r_opt 2 -test_r_opt 2 -test_r_opt 3 -test_r_opt 16 -test_r_opt 2 -test_r_opt 2 -test_r_opt 0 -test_r_opt Search in 3 2.000000000 1 -test_r_opt 2.000000000 -test_r_opt 2.000000000 -test_r_opt 3 -test_r_opt 16 -test_r_opt 2 -test_r_opt 2 -test_r_opt 0 -test_r_opt Search in 3.000000000 2 1 -test_r_opt 2 -test_r_opt 2 -test_r_opt 3.000000000 -test_r_opt 16 -test_r_opt 12 -test_r_opt 12 -test_r_opt 0 diff --git a/README b/README index 0914e4a..680a94f 100644 --- a/README +++ b/README @@ -2,9 +2,9 @@ Tools developed by Ole Tange . Probably not useful for you, but then again you never now. -blink - blink disks in a disk enclosure - -bsearch - binary search through sorted text files. +2search - binary search through sorted text files. + +blink - blink disks in a disk enclosure. decrypt-root-with-usb - patch for cryptroot to decrypt root with key on USB. @@ -14,6 +14,8 @@ em - force emacs to run in terminal. Use xemacs if installed. field - split on whitespace. 
Give the given field number. Supports syntax 1-3,6- +find-first-fail - find the lowest argument that makes a command fail. + forever - run the same command or list of commands every second. G - shorthand for multi level grep.