2search: bsearch/bgrep renamed to 2search/2grep (bgrep is used by others).
This commit is contained in:
parent
9efd18d0fc
commit
e8f520f642
777
2search/2grep
Executable file
777
2search/2grep
Executable file
|
@ -0,0 +1,777 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
=head1 NAME
|
||||
|
||||
2search - binary search through sorted text files
|
||||
|
||||
2grep - binary search+grep through sorted text files
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
B<2search> [-nrfB] file string [string...]
|
||||
|
||||
B<2search> --grep [-nrf] file string [string...]
|
||||
|
||||
B<2grep> [-nrf] file string [string...]
|
||||
|
||||
... | B<2search> [-nrfB] file
|
||||
|
||||
... | B<2search> --grep [-nrf] file
|
||||
|
||||
... | B<2grep> [-nrf] file
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
B<2search> searches a sorted file for a string. It outputs the
|
||||
following line or the byte position of this line, which is where the
|
||||
string would have been if it had been in the sorted file.
|
||||
|
||||
B<2grep> outputs all lines starting with a given string. The file must
|
||||
be sorted.
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<--ignore-leading-blanks>
|
||||
|
||||
=item B<-b>
|
||||
|
||||
ignore leading blanks
|
||||
|
||||
|
||||
=item B<--byte-offset>
|
||||
|
||||
=item B<-B>
|
||||
|
||||
print byte position where string would have been
|
||||
|
||||
|
||||
=item B<--dictionary-order> (not implemented)
|
||||
|
||||
=item B<-d>
|
||||
|
||||
consider only blanks and alphanumeric characters
|
||||
|
||||
|
||||
=item B<--debug> (not implemented)
|
||||
|
||||
=item B<-D>
|
||||
|
||||
annotate the part of the line used to sort, and warn about
|
||||
questionable usage to stderr
|
||||
|
||||
|
||||
=item B<--ignore-case>
|
||||
|
||||
=item B<-f>
|
||||
|
||||
fold lower case to upper case characters
|
||||
|
||||
|
||||
=item B<--file> I<file>
|
||||
|
||||
=item B<-F> I<file>
|
||||
|
||||
search for all lines in I<file>
|
||||
|
||||
|
||||
=item B<--general-numeric-sort> (not implemented)
|
||||
|
||||
=item B<-g>
|
||||
|
||||
compare according to general numerical value
|
||||
|
||||
|
||||
=item B<--ignore-nonprinting> (not implemented)
|
||||
|
||||
=item B<-i>
|
||||
|
||||
consider only printable characters
|
||||
|
||||
|
||||
=item B<--month-sort>
|
||||
|
||||
=item B<-M>
|
||||
|
||||
compare (unknown) < 'JAN' < ... < 'DEC'
|
||||
|
||||
|
||||
=item B<--human-numeric-sort>
|
||||
|
||||
=item B<-h>
|
||||
|
||||
compare human readable numbers (e.g., 2K 1G)
|
||||
|
||||
|
||||
=item B<--key=KEYDEF> (not implemented)
|
||||
|
||||
=item B<-k>
|
||||
|
||||
sort via a key; KEYDEF gives location and type
|
||||
|
||||
|
||||
=item B<--numeric-sort>
|
||||
|
||||
=item B<-n>
|
||||
|
||||
compare according to string numerical value. If numerical values are
|
||||
the same: split the string into blocks of numbers and non-numbers, and
|
||||
compare numbers as numbers and strings as strings.
|
||||
|
||||
This will sort like this: chr3 chr11 3chr 11chr
|
||||
|
||||
|
||||
=item B<--numascii>
|
||||
|
||||
=item B<-N>
|
||||
|
||||
compare according to string numerical value. If numerical values are
|
||||
the same: compare as strings
|
||||
|
||||
|
||||
=item B<--random-sort>
|
||||
|
||||
=item B<-R>
|
||||
|
||||
sort by random hash of keys
|
||||
|
||||
|
||||
=item B<--reverse>
|
||||
|
||||
=item B<-r>
|
||||
|
||||
reverse the result of comparisons
|
||||
|
||||
|
||||
=item B<--sort=WORD> (not implemented)
|
||||
|
||||
sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
|
||||
B<-M>, numeric B<-n>, random B<-R>, version B<-V>
|
||||
|
||||
|
||||
=item B<-t>
|
||||
|
||||
=item B<--field-separator=SEP>
|
||||
|
||||
use SEP instead of non-blank to blank transition
|
||||
|
||||
|
||||
=item B<-z>
|
||||
|
||||
=item B<--zero-terminated>
|
||||
|
||||
end lines with 0 byte, not newline
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 Single key
|
||||
|
||||
Input is sorted by Chromosome,Position:
|
||||
|
||||
SampleID Position Chromosome
|
||||
foo 10000123 chr3
|
||||
foo 10000125 chr3
|
||||
foo 9999998 chr11
|
||||
foo 10000124 chr11
|
||||
foo 10000126 chr11
|
||||
|
||||
To find all chr3:
|
||||
|
||||
2grep -n -k3 inputfile chr3
|
||||
|
||||
-n will split 'chr3' into 'chr' which is compared asciibetically and
|
||||
'3' which is compared numerically.
|
||||
|
||||
=head2 Not implemented
|
||||
|
||||
To find all lines with chr3,10000125:
|
||||
|
||||
2grep -k3n,2n inputfile chr3 10000125
|
||||
|
||||
|
||||
|
||||
=head1 REPORTING BUGS
|
||||
|
||||
B<2search> is part of tangetools. Report bugs to <tools@tange.dk>.
|
||||
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Copyright (C) 2016-2020 Ole Tange http://ole.tange.dk
|
||||
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
Copyright (C) 2013 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
=head2 Documentation license I
|
||||
|
||||
Permission is granted to copy, distribute and/or modify this documentation
|
||||
under the terms of the GNU Free Documentation License, Version 1.3 or
|
||||
any later version published by the Free Software Foundation; with no
|
||||
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
|
||||
Texts. A copy of the license is included in the file fdl.txt.
|
||||
|
||||
=head2 Documentation license II
|
||||
|
||||
You are free:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<to Share>
|
||||
|
||||
to copy, distribute and transmit the work
|
||||
|
||||
=item B<to Remix>
|
||||
|
||||
to adapt the work
|
||||
|
||||
=back
|
||||
|
||||
Under the following conditions:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Attribution>
|
||||
|
||||
You must attribute the work in the manner specified by the author or
|
||||
licensor (but not in any way that suggests that they endorse you or
|
||||
your use of the work).
|
||||
|
||||
=item B<Share Alike>
|
||||
|
||||
If you alter, transform, or build upon this work, you may distribute
|
||||
the resulting work only under the same, similar or a compatible
|
||||
license.
|
||||
|
||||
=back
|
||||
|
||||
With the understanding that:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Waiver>
|
||||
|
||||
Any of the above conditions can be waived if you get permission from
|
||||
the copyright holder.
|
||||
|
||||
=item B<Public Domain>
|
||||
|
||||
Where the work or any of its elements is in the public domain under
|
||||
applicable law, that status is in no way affected by the license.
|
||||
|
||||
=item B<Other Rights>
|
||||
|
||||
In no way are any of the following rights affected by the license:
|
||||
|
||||
=over 9
|
||||
|
||||
=item *
|
||||
|
||||
Your fair dealing or fair use rights, or other applicable
|
||||
copyright exceptions and limitations;
|
||||
|
||||
=item *
|
||||
|
||||
The author's moral rights;
|
||||
|
||||
=item *
|
||||
|
||||
Rights other persons may have either in the work itself or in
|
||||
how the work is used, such as publicity or privacy rights.
|
||||
|
||||
=back
|
||||
|
||||
=item B<Notice>
|
||||
|
||||
For any reuse or distribution, you must make clear to others the
|
||||
license terms of this work.
|
||||
|
||||
=back
|
||||
|
||||
A copy of the full license is included in the file as cc-by-sa.txt.
|
||||
|
||||
=head1 DEPENDENCIES
|
||||
|
||||
B<2search>/B<2grep> uses Perl.
|
||||
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
B<grep>(1), B<sort>(1).
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
|
||||
# Allow bundled single-letter options (-nr) and stop option parsing at
# the first non-option argument (the file to search).
Getopt::Long::Configure("bundling","require_order");

# GetOptions reports every invalid option on stderr and returns false.
# Exit instead of searching with a partially parsed configuration.
GetOptions(
    "debug|D" => \$opt::D,
    "version" => \$opt::version,
    "verbose|v" => \$opt::verbose,
    "B|byte-offset" => \$opt::byte_offset,
    "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
    "d|dictionary-order" => \$opt::dictionary_order,
    "f|ignore-case" => \$opt::ignore_case,
    "g|general-numeric-sort" => \$opt::general_numeric_sort,
    "G|grep" => \$opt::grep,
    "F|file=s" => \$opt::file,
    "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
    "M|month-sort" => \$opt::month_sort,
    "h|human-numeric-sort" => \$opt::human_numeric_sort,
    "n|numeric-sort" => \$opt::numeric_sort,
    "N|numascii" => \$opt::numascii,
    "r|reverse" => \$opt::reverse,
    "R|random-sort" => \$opt::random_sort,
    "sort=s" => \$opt::sort,
    "V|version-sort" => \$opt::version_sort,
    "k|key=s" => \@opt::key,
    "t|field-separator=s" => \$opt::field_separator,
    "z|zero-terminated" => \$opt::zero_terminated,
    ) or exit(255);
# Name the program was invoked as (last path component of $0)
$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
$Global::version = 20200328;
if($opt::version) { version(); exit 0; }
# -z: set the input record separator to NUL for all line reads
if($opt::zero_terminated) { $/ = "\0"; }
if(@opt::key) {
    # Default separator if --key = whitespace
    $Global::sep = '\s+';
    if(defined $opt::field_separator) { $Global::sep = $opt::field_separator; }
}
# Invoked under the name 2grep: behave like 2search --grep
if($Global::progname eq "2grep") { $opt::grep = 1; }
$Global::debug = $opt::D;

parse_keydef();

debug(my_dump(\@Global::keydefs),"\n");

# First non-option argument is the sorted file to search
my $file = shift;
# Decide where get() fetches the search strings from:
# remaining arguments, the --file file, or stdin
if(@ARGV) {
    $opt::argv = 1;
} elsif(defined $opt::file) {
    # skip
} else {
    $opt::stdin = 1;
}

# Each round fetches one search value per key definition and runs a
# single search; the loop ends when get() has no more values.
round:
while(1) {
    my @search_vals;
    for(@Global::keydefs) {
        my $val = get();
        if(not defined $val) {
            last round;
        }
        push @search_vals, $val;
    }
    if($opt::grep) {
        bgrep($file,@search_vals);
    } else {
        print bsearch($file,@search_vals);
    }
}
|
||||
|
||||
{
    # Handle for the --file input. It is opened on the first call to
    # get() and kept open between calls.
    my $fh;

    sub get {
        # Fetch the next search string.
        # Uses:
        #   $opt::argv = take strings from @ARGV
        #   $opt::file = take strings from this file (one per line)
        #   $opt::stdin = take strings from stdin (one per line)
        # Returns:
        #   $val = next search string (line terminator removed)
        #          undef when there are no more strings
        if($opt::argv) {
            # Search for strings on the command line
            return shift @ARGV;
        }
        if(defined $opt::file) {
            # Search for strings given with --file
            if(not $fh) {
                # Open into the enclosing $fh: declaring a new 'my $fh'
                # here would leave the handle read below unopened
                if(not open($fh, "<", $opt::file)) {
                    error("Cannot open $opt::file");
                    exit(255);
                }
            }
            my $val = <$fh>;
            defined $val and chomp $val;
            return $val;
        }
        if($opt::stdin) {
            # Search for strings on stdin
            my $val = <>;
            defined $val and chomp $val;
            return $val;
        }
        # No input source selected: should not happen
        die;
    }
}
|
||||
|
||||
sub bgrep {
    # Print all lines in a sorted file whose key(s) start with the
    # search values.
    # Input:
    #   $file = name of the sorted file
    #   @search_vals = one search value per key definition
    # Returns: N/A
    my $file = shift;
    my @search_vals = @_;
    # bsearch must return a byte position here, not a line. 'local'
    # restores the user's --byte-offset setting when bgrep returns.
    local $opt::byte_offset = 1;
    # bsearch returns the position followed by "\n": keep the number only
    my $startpos = int(bsearch($file,@search_vals));
    my $fh;
    if(not open ($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    seek($fh,$startpos,0) or die("Cannot seek to $startpos");
    # Allow for partial matches in grep (4 matches 40, A matches Aaa)
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 1;
    }
    # Test with 'defined': a final unterminated line containing only "0"
    # is false as a string but must still be compared and printed
    while(defined(my $line = <$fh>)) {
        # Stop at the first line that no longer matches
        compare($line,@search_vals) and last;
        print $line;
    }
    close $fh;
    # The binary search for the next value must use full comparisons again
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 0;
    }
}
|
||||
|
||||
sub bsearch {
# Binary search a sorted file for the search values.
# Input:
#   $file = name of the sorted file
#   @search_vals = one search value per key definition
# Uses:
#   $opt::byte_offset = return a byte position instead of a line
# Returns:
#   the first line found for which compare() >= 0 (or the line read
#   after the last smaller line), or with $opt::byte_offset the byte
#   position of that line followed by "\n"
|
||||
my $file = shift;
|
||||
my @search_vals = @_;
|
||||
my $min = 0;
|
||||
my $max = -s $file;
|
||||
my $fh;
|
||||
if(not open ($fh, "<", $file)) {
|
||||
error("Cannot open '$file'");
|
||||
exit 1;
|
||||
}
|
||||
my($line,$middle);
|
||||
# $minnl/$maxnl = start-of-line positions found when probing $min/$max
my $minnl = $min;
|
||||
my $maxnl = $max;
|
||||
while($max - $min > 1) {
|
||||
$middle = int(($max + $min)/2);
|
||||
seek($fh,$middle,0) or die("Cannot seek to $middle");
|
||||
if($middle > 0) {
|
||||
# $middle is probably inside a line: skip the rest of that line
|
||||
<$fh>;
|
||||
}
|
||||
# Position of the first line that starts after $middle
my $newline_pos = tell($fh);
|
||||
debug("$min <= $middle <= $newline_pos <= $max\n");
|
||||
debug("$minnl <= $newline_pos <= $maxnl\n");
|
||||
if($newline_pos == $maxnl
|
||||
or
|
||||
eof($fh)
|
||||
or
|
||||
compare(($line = <$fh>),@search_vals) >= 0) {
|
||||
# We have seen this newline position before
|
||||
# or we are at the end of the file
|
||||
# or the line is >= the search values: search the lower half
|
||||
$max = $middle;
|
||||
$maxnl = $newline_pos;
|
||||
} else {
|
||||
# The line is < the search values: search the upper half
|
||||
$min = $middle;
|
||||
$minnl = $newline_pos;
|
||||
}
|
||||
}
|
||||
# Re-read the line at the lower bound and decide between it and the next line
seek($fh,$minnl,0) or die("Cannot seek to $minnl");
|
||||
$line = <$fh>;
|
||||
if(compare($line,@search_vals) >= 0) {
|
||||
if($opt::byte_offset) {
|
||||
return $minnl."\n";
|
||||
} else {
|
||||
return $line;
|
||||
}
|
||||
} else {
|
||||
if($opt::byte_offset) {
|
||||
return tell($fh)."\n";
|
||||
} else {
|
||||
$line=<$fh>;
|
||||
return $line;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub parse_keydef {
    # Build @Global::keydefs from the --key options.
    # Each --key is a comma separated list of F[.C][OPTS]:
    #   F = field number, C = first character in the field (default 1),
    #   OPTS = ordering letters that are switched on for this key
    # Every key definition starts with the orderings given as global
    # options (-n, -r, ...).
    # Uses:
    #   @opt::key, $opt::<ordering options>
    #   @Global::keydefs, $Global::sep
    # Returns: N/A

    # Ordering letter => name of the flag in a key definition
    my %flag_name = (
        "b" => 'ignore_leading_blanks',
        "d" => 'dictionary_order',
        "f" => 'ignore_case',
        "g" => 'general_numeric_sort',
        "i" => 'ignore_nonprinting',
        "M" => 'month_sort',
        "h" => 'human_numeric_sort',
        "n" => 'numeric_sort',
        "N" => 'numascii',
        "r" => 'reverse',
        "R" => 'random_sort',
        "V" => 'version_sort',
        );
    # Flag name => value of the corresponding global option
    my %global_flag = (
        'ignore_leading_blanks' => $opt::ignore_leading_blanks,
        'dictionary_order' => $opt::dictionary_order,
        'ignore_case' => $opt::ignore_case,
        'general_numeric_sort' => $opt::general_numeric_sort,
        'ignore_nonprinting' => $opt::ignore_nonprinting,
        'month_sort' => $opt::month_sort,
        'human_numeric_sort' => $opt::human_numeric_sort,
        'numeric_sort' => $opt::numeric_sort,
        'numascii' => $opt::numascii,
        'reverse' => $opt::reverse,
        'random_sort' => $opt::random_sort,
        'version_sort' => $opt::version_sort,
        );

    if(not @opt::key) {
        # No --key given: convert -n -r to -k1rn with no separator,
        # so the whole line is the key
        $Global::sep = undef;
        push(@Global::keydefs, { 'field' => 1, 'char' => 1, %global_flag });
    }

    for my $spec (map { split /,/, $_ } @opt::key) {
        if($spec !~ /^(\d+)(\.(\d+))?([bdfgiMhnNRrV]+)?$/) {
            error("Keydef $spec does not match F[.C][OPTS]");
            exit(255);
        }
        my ($field,$char,$letters) = ($1,$3,$4);
        my %def = ('field' => $field, 'char' => $char || 1, %global_flag);
        # Letters on the key itself switch orderings on for this key only
        for my $letter (split //, defined $letters ? $letters : "") {
            $def{$flag_name{$letter}} = 1;
        }
        push(@Global::keydefs, \%def);
    }
}
|
||||
|
||||
sub compare {
    # Compare a line from the file with the search values, one search
    # value per key definition.
    # Input:
    #   $line = line from the file (with or without line terminator)
    #   @wanted = search values in the order of @Global::keydefs
    # Returns:
    #   result of compare_single for the first key that differs
    #   0 if no key differs
    my($line,@wanted) = @_;
    chomp($line);
    debug("Compare: $line <=> @wanted ");
    # Without a separator the whole line is the only field
    my @columns = $Global::sep ? split(/$Global::sep/o, $line) : ($line);
    my $idx = 0;
    for my $keydef (@Global::keydefs) {
        # keydef = F[.C][OPTS]: field F from character C and on
        my $cmp = compare_single(substr($columns[$keydef->{'field'} - 1],
                                        $keydef->{'char'} - 1),
                                 $wanted[$idx],
                                 $keydef);
        $idx++;
        debug("== $cmp\n");
        # They differ on this key
        return $cmp if $cmp;
    }
    # No difference on any keydefs
    return 0;
}
|
||||
|
||||
sub compare_single {
    # Compare one key from a line with one search value based on the
    # order options of the key definition.
    # Input:
    #   $left = key text taken from the line
    #   $right = search value
    #   $opt = key definition: hash ref with the ordering flags from
    #          parse_keydef and 'partial_match' (set by bgrep)
    # Returns:
    #   <0, 0 or >0 as <=> and cmp do
    my ($left,$right,$opt) = @_;
    if($Global::debug) {
        # Only build the dump when debugging: my_dump does string evals
        # and this sub is called for every comparison
        debug("$left <=> $right");
        debug(my_dump($opt),"\n");
    }
    if($opt->{'random_sort'}) {
        return rand() <=> rand();
    }
    if($opt->{'ignore_leading_blanks'}) {
        $left =~ s/^\s+//;
        $right =~ s/^\s+//;
    }
    if($opt->{'ignore_case'}) {
        $left = uc($left);
        $right = uc($right);
    }
    if($opt->{'partial_match'}) {
        # String 'foo' matches 'foobar'
        $left = substr($left,0,length $right);
    }
    if($opt->{'reverse'}) {
        ($left,$right) = ($right,$left);
    }
    if($opt->{'human_numeric_sort'}) {
        return multiply_binary_prefix($left) <=> multiply_binary_prefix($right);
    }
    if($opt->{'month_sort'}) {
        # (unknown) < 'JAN' < ... < 'DEC'
        my @mon = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
        my %m;
        # List (not anonymous hash) on the right: JAN=>1 ... DEC=>12
        @m{@mon} = (1..12);
        # As sort -M: skip leading blanks, fold to upper case and use
        # the three letter abbreviation. Unknown names rank as 0.
        my @rank = map {
            my $name = uc(defined $_ ? $_ : "");
            $name =~ s/^\s+//;
            $m{substr($name,0,3)} || 0;
        } ($left,$right);
        return $rank[0] <=> $rank[1];
    }
    if($opt->{'numeric_sort'}) {
        return $left <=> $right;
    } elsif($opt->{'numascii'}) {
        # '||' (not 'or'): 'return A or B' returns A before B is tried
        return (($left <=> $right) || ($left cmp $right));
    } else {
        return $left cmp $right;
    }
}
|
||||
|
||||
sub multiply_binary_prefix(@) {
    # Evaluate numbers with binary prefix
    # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^60, Zi=2^70, Yi=2^80
    # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^60, zi=2^70, yi=2^80
    # K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^60, Z =2^70, Y =2^80
    # k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24
    # (Xi, xi and X continue with 2^90; x continues with 10^27)
    # 13G = 13*1024*1024*1024 = 13958643712
    # Input:
    #   @v = strings with prefixes
    # Returns:
    #   list context: all values with prefixes multiplied
    #   scalar context: the first value with prefixes multiplied
    my @v = @_;
    # 1E3=1000, 1E-3=0.001
    s/e([+-]?\d+)/*10**$1/gi for @v;

    # Substitution rules in the order they must be applied: two letter
    # prefixes (any case), then upper case letters, then lower case
    my (@two_letter,@upper,@lower);
    my $power = 0;
    for my $lc (qw(k m g t p e z y x)) {
        $power++;
        my $uc = uc $lc;
        push @two_letter, [ qr/${lc}i/i, '*1024' x $power ];
        push @upper, [ qr/$uc/, '*1024' x $power ];
        push @lower, [ qr/$lc/, '*1000' x $power ];
    }

    for(@v) {
        defined $_ or next;
        for my $rule (@two_letter, @upper, @lower) {
            my ($prefix,$factor) = @$rule;
            s/$prefix/$factor/g;
        }
        # NOTE(review): string eval of a value that comes from the
        # searched file or the command line - arbitrary Perl in the
        # value will be executed
        $_ = eval $_;
    }
    return wantarray ? @v : $v[0];
}
|
||||
|
||||
sub status {
    # Print each message followed by a newline on the status
    # filehandle ($Global::status_fd if set, otherwise STDERR) and
    # flush the filehandle.
    # Input:
    #   @messages = messages to print
    # Returns: N/A
    my @messages = @_;
    my $out = $Global::status_fd || *STDERR;
    print {$out} map { ($_, "\n") } @messages;
    $out->flush();
}
|
||||
|
||||
sub status_no_nl {
    # Print the messages unchanged (no newline added) on the status
    # filehandle ($Global::status_fd if set, otherwise STDERR) and
    # flush the filehandle.
    # Input:
    #   @messages = strings to print
    # Returns: N/A
    my @messages = @_;
    my $out = $Global::status_fd || *STDERR;
    print {$out} @messages;
    $out->flush();
}
|
||||
|
||||
sub warning {
    # Print one "PROGNAME: Warning: MESSAGE" line per message on the
    # status filehandle.
    # Input:
    #   @messages = warning texts
    # Returns: N/A
    my @messages = @_;
    my $prog = $Global::progname || "parallel";
    my @parts;
    for my $message (@messages) {
        push @parts, $prog, ": Warning: ", $message, "\n";
    }
    status_no_nl(@parts);
}
|
||||
|
||||
sub error {
    # Print one "PROGNAME: Error: MESSAGE" line per message on the
    # status filehandle. Does not exit: the caller decides.
    # Input:
    #   @messages = error texts
    # Returns: N/A
    my @messages = @_;
    my $prog = $Global::progname || "parallel";
    my @lines;
    for my $message (@messages) {
        push @lines, $prog.": Error: ".$message;
    }
    status(@lines);
}
|
||||
|
||||
sub die_bug {
    # Tell the user on STDERR how to report an internal error and
    # exit with status 255.
    # Input:
    #   $bugid = identifier of the place in the code that failed
    # Returns: N/A (never returns)
    my $bugid = shift;
    my @report = (
        "$Global::progname: This should not happen. You have found a bug.\n",
        "Please submit a bug at https://gitlab.com/ole.tange/tangetools/-/issues\n",
        "and include:\n",
        "* The version number: $Global::version\n",
        "* The bugid: $bugid\n",
        "* The command line being run\n",
        "* The files being read (put the files on a webserver if they are big)\n",
        "\n",
        "If you get the error on smaller/fewer files, please include those instead.\n",
        );
    print STDERR @report;
    exit(255);
}
|
||||
|
||||
sub version {
    # Print program name, version, copyright and license summary on
    # the currently selected output filehandle.
    # Returns: N/A
    my @lines = (
        "$Global::progname $Global::version",
        "Copyright (C) 2016-2020",
        "Ole Tange and Free Software Foundation, Inc.",
        "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
        "This is free software: you are free to change and redistribute it.",
        "$Global::progname comes with no warranty.",
        "",
        "Web site: https://gitlab.com/ole.tange/tangetools/\n",
        );
    print join("\n", @lines);
}
|
||||
|
||||
sub my_dump(@) {
    # Input:
    #   @dump_this = data structures to dump
    # Returns:
    #   ascii expression of object if Data::Dump(er) is installed
    #   error message otherwise
    my @dump_this = (@_);
    eval "use Data::Dump qw(dump);";
    # Data::Dump::dump has been seen undefined even though Data::Dump
    # loaded (reported by Hans Schou): then fall back to Data::Dumper.
    # (The former workaround 'sub Data::Dump:dump {}' was a syntax
    # error inside eval and never had any effect.)
    if(not $@ and defined &Data::Dump::dump) {
        return (Data::Dump::dump(@dump_this));
    }
    # Data::Dump not installed or not usable
    eval "use Data::Dumper;";
    if(not $@) {
        return Dumper(@dump_this);
    }
    my $err = "Neither Data::Dump nor Data::Dumper is installed\n".
        "Not dumping output\n";
    ::status($err);
    return $err;
}
|
||||
|
||||
sub debug(@) {
    # Print the arguments on the currently selected output filehandle,
    # but only when debugging is switched on ($Global::debug is true).
    # Returns: N/A
    my @messages = @_;
    return unless $Global::debug;
    print @messages;
}
|
777
2search/2search
Executable file
777
2search/2search
Executable file
|
@ -0,0 +1,777 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
=head1 NAME
|
||||
|
||||
2search - binary search through sorted text files
|
||||
|
||||
2grep - binary search+grep through sorted text files
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
B<2search> [-nrfB] file string [string...]
|
||||
|
||||
B<2search> --grep [-nrf] file string [string...]
|
||||
|
||||
B<2grep> [-nrf] file string [string...]
|
||||
|
||||
... | B<2search> [-nrfB] file
|
||||
|
||||
... | B<2search> --grep [-nrf] file
|
||||
|
||||
... | B<2grep> [-nrf] file
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
B<2search> searches a sorted file for a string. It outputs the
|
||||
following line or the byte position of this line, which is where the
|
||||
string would have been if it had been in the sorted file.
|
||||
|
||||
B<2grep> outputs all lines starting with a given string. The file must
|
||||
be sorted.
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<--ignore-leading-blanks>
|
||||
|
||||
=item B<-b>
|
||||
|
||||
ignore leading blanks
|
||||
|
||||
|
||||
=item B<--byte-offset>
|
||||
|
||||
=item B<-B>
|
||||
|
||||
print byte position where string would have been
|
||||
|
||||
|
||||
=item B<--dictionary-order> (not implemented)
|
||||
|
||||
=item B<-d>
|
||||
|
||||
consider only blanks and alphanumeric characters
|
||||
|
||||
|
||||
=item B<--debug> (not implemented)
|
||||
|
||||
=item B<-D>
|
||||
|
||||
annotate the part of the line used to sort, and warn about
|
||||
questionable usage to stderr
|
||||
|
||||
|
||||
=item B<--ignore-case>
|
||||
|
||||
=item B<-f>
|
||||
|
||||
fold lower case to upper case characters
|
||||
|
||||
|
||||
=item B<--file> I<file>
|
||||
|
||||
=item B<-F> I<file>
|
||||
|
||||
search for all lines in I<file>
|
||||
|
||||
|
||||
=item B<--general-numeric-sort> (not implemented)
|
||||
|
||||
=item B<-g>
|
||||
|
||||
compare according to general numerical value
|
||||
|
||||
|
||||
=item B<--ignore-nonprinting> (not implemented)
|
||||
|
||||
=item B<-i>
|
||||
|
||||
consider only printable characters
|
||||
|
||||
|
||||
=item B<--month-sort>
|
||||
|
||||
=item B<-M>
|
||||
|
||||
compare (unknown) < 'JAN' < ... < 'DEC'
|
||||
|
||||
|
||||
=item B<--human-numeric-sort>
|
||||
|
||||
=item B<-h>
|
||||
|
||||
compare human readable numbers (e.g., 2K 1G)
|
||||
|
||||
|
||||
=item B<--key=KEYDEF> (not implemented)
|
||||
|
||||
=item B<-k>
|
||||
|
||||
sort via a key; KEYDEF gives location and type
|
||||
|
||||
|
||||
=item B<--numeric-sort>
|
||||
|
||||
=item B<-n>
|
||||
|
||||
compare according to string numerical value. If numerical values are
|
||||
the same: split the string into blocks of numbers and non-numbers, and
|
||||
compare numbers as numbers and strings as strings.
|
||||
|
||||
This will sort like this: chr3 chr11 3chr 11chr
|
||||
|
||||
|
||||
=item B<--numascii>
|
||||
|
||||
=item B<-N>
|
||||
|
||||
compare according to string numerical value. If numerical values are
|
||||
the same: compare as strings
|
||||
|
||||
|
||||
=item B<--random-sort>
|
||||
|
||||
=item B<-R>
|
||||
|
||||
sort by random hash of keys
|
||||
|
||||
|
||||
=item B<--reverse>
|
||||
|
||||
=item B<-r>
|
||||
|
||||
reverse the result of comparisons
|
||||
|
||||
|
||||
=item B<--sort=WORD> (not implemented)
|
||||
|
||||
sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
|
||||
B<-M>, numeric B<-n>, random B<-R>, version B<-V>
|
||||
|
||||
|
||||
=item B<-t>
|
||||
|
||||
=item B<--field-separator=SEP>
|
||||
|
||||
use SEP instead of non-blank to blank transition
|
||||
|
||||
|
||||
=item B<-z>
|
||||
|
||||
=item B<--zero-terminated>
|
||||
|
||||
end lines with 0 byte, not newline
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 Single key
|
||||
|
||||
Input is sorted by Chromosome,Position:
|
||||
|
||||
SampleID Position Chromosome
|
||||
foo 10000123 chr3
|
||||
foo 10000125 chr3
|
||||
foo 9999998 chr11
|
||||
foo 10000124 chr11
|
||||
foo 10000126 chr11
|
||||
|
||||
To find all chr3:
|
||||
|
||||
2grep -n -k3 inputfile chr3
|
||||
|
||||
-n will split 'chr3' into 'chr' which is compared asciibetically and
|
||||
'3' which is compared numerically.
|
||||
|
||||
=head2 Not implemented
|
||||
|
||||
To find all lines with chr3,10000125:
|
||||
|
||||
2grep -k3n,2n inputfile chr3 10000125
|
||||
|
||||
|
||||
|
||||
=head1 REPORTING BUGS
|
||||
|
||||
B<2search> is part of tangetools. Report bugs to <tools@tange.dk>.
|
||||
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Copyright (C) 2016-2020 Ole Tange http://ole.tange.dk
|
||||
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
Copyright (C) 2013 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
=head2 Documentation license I
|
||||
|
||||
Permission is granted to copy, distribute and/or modify this documentation
|
||||
under the terms of the GNU Free Documentation License, Version 1.3 or
|
||||
any later version published by the Free Software Foundation; with no
|
||||
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
|
||||
Texts. A copy of the license is included in the file fdl.txt.
|
||||
|
||||
=head2 Documentation license II
|
||||
|
||||
You are free:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<to Share>
|
||||
|
||||
to copy, distribute and transmit the work
|
||||
|
||||
=item B<to Remix>
|
||||
|
||||
to adapt the work
|
||||
|
||||
=back
|
||||
|
||||
Under the following conditions:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Attribution>
|
||||
|
||||
You must attribute the work in the manner specified by the author or
|
||||
licensor (but not in any way that suggests that they endorse you or
|
||||
your use of the work).
|
||||
|
||||
=item B<Share Alike>
|
||||
|
||||
If you alter, transform, or build upon this work, you may distribute
|
||||
the resulting work only under the same, similar or a compatible
|
||||
license.
|
||||
|
||||
=back
|
||||
|
||||
With the understanding that:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Waiver>
|
||||
|
||||
Any of the above conditions can be waived if you get permission from
|
||||
the copyright holder.
|
||||
|
||||
=item B<Public Domain>
|
||||
|
||||
Where the work or any of its elements is in the public domain under
|
||||
applicable law, that status is in no way affected by the license.
|
||||
|
||||
=item B<Other Rights>
|
||||
|
||||
In no way are any of the following rights affected by the license:
|
||||
|
||||
=over 9
|
||||
|
||||
=item *
|
||||
|
||||
Your fair dealing or fair use rights, or other applicable
|
||||
copyright exceptions and limitations;
|
||||
|
||||
=item *
|
||||
|
||||
The author's moral rights;
|
||||
|
||||
=item *
|
||||
|
||||
Rights other persons may have either in the work itself or in
|
||||
how the work is used, such as publicity or privacy rights.
|
||||
|
||||
=back
|
||||
|
||||
=item B<Notice>
|
||||
|
||||
For any reuse or distribution, you must make clear to others the
|
||||
license terms of this work.
|
||||
|
||||
=back
|
||||
|
||||
A copy of the full license is included in the file as cc-by-sa.txt.
|
||||
|
||||
=head1 DEPENDENCIES
|
||||
|
||||
B<2search>/B<2grep> uses Perl.
|
||||
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
B<grep>(1), B<sort>(1).
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
|
||||
Getopt::Long::Configure("bundling","require_order");

# Parse the command line. The ordering flags carry the same names as
# the sort(1) flags and are copied into the key definitions by
# parse_keydef().
if(not GetOptions(
       "debug|D" => \$opt::D,
       "version" => \$opt::version,
       "verbose|v" => \$opt::verbose,
       "B|byte-offset" => \$opt::byte_offset,
       "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
       "d|dictionary-order" => \$opt::dictionary_order,
       "f|ignore-case" => \$opt::ignore_case,
       "g|general-numeric-sort" => \$opt::general_numeric_sort,
       "G|grep" => \$opt::grep,
       "F|file=s" => \$opt::file,
       "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
       "M|month-sort" => \$opt::month_sort,
       "h|human-numeric-sort" => \$opt::human_numeric_sort,
       "n|numeric-sort" => \$opt::numeric_sort,
       "N|numascii" => \$opt::numascii,
       "r|reverse" => \$opt::reverse,
       "R|random-sort" => \$opt::random_sort,
       "sort=s" => \$opt::sort,
       "V|version-sort" => \$opt::version_sort,
       "k|key=s" => \@opt::key,
       "t|field-separator=s" => \$opt::field_separator,
       "z|zero-terminated" => \$opt::zero_terminated,
   )) {
    # GetOptions returned false: the command line could not be parsed.
    # Searching with half-parsed options would give misleading results.
    exit(255);
}
# Program name = last path component of $0
$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
$Global::version = 20200328;
if($opt::version) { version(); exit 0; }
# --zero-terminated: records end with a NUL byte instead of a newline
if($opt::zero_terminated) { $/ = "\0"; }
if(@opt::key) {
    # Default separator if --key = whitespace
    $Global::sep = '\s+';
    if(defined $opt::field_separator) { $Global::sep = $opt::field_separator; }
}
# Called as 2grep: behave as 2search --grep
if($Global::progname eq "2grep") { $opt::grep = 1; }
$Global::debug = $opt::D;

parse_keydef();

# Only build the dump when it is going to be printed
if($Global::debug) { debug(my_dump(\@Global::keydefs),"\n"); }

# First non-option argument is the sorted file to search in
my $file = shift;
if(@ARGV) {
    # Search strings are given on the command line
    $opt::argv = 1;
} elsif(defined $opt::file) {
    # Search strings are read from --file: get() handles this
} else {
    # Search strings are read from stdin
    $opt::stdin = 1;
}

# Each round fetches one search value per key definition and searches
# for that combination. Stop as soon as the input runs out.
round:
while(1) {
    my @search_vals;
    for(@Global::keydefs) {
        my $val = get();
        if(not defined $val) {
            last round;
        }
        push @search_vals, $val;
    }
    if($opt::grep) {
        bgrep($file,@search_vals);
    } else {
        print bsearch($file,@search_vals);
    }
}
|
||||
|
||||
{
    # File handle for --file. Shared by all calls to get() so the file
    # is opened once and read line by line.
    my $fh;

    sub get {
        # Fetch the next search string
        # Input: none (source is selected by $opt::argv, $opt::file, $opt::stdin)
        # Returns:
        #   $val = next search string without record separator
        #          undef when there are no more search strings
        if($opt::argv) {
            # Search for strings on the command line
            return shift @ARGV;
        }
        if(defined $opt::file) {
            # Search for strings given with --file
            if(not $fh) {
                # Open into the shared $fh. Declaring a new "my $fh"
                # here would leave the shared handle undefined.
                if(not open($fh, "<", $opt::file)) {
                    error("Cannot open $opt::file");
                    exit(255);
                }
            }
            my $val = <$fh>;
            # At end of file $val is undef: leave it as undef
            if(defined $val) { chomp $val; }
            return $val;
        }
        if($opt::stdin) {
            # Search for strings on stdin
            my $val = <>;
            if(defined $val) { chomp $val; }
            return $val;
        }
        # None of the three sources is selected
        die;
    }
}
|
||||
|
||||
sub bgrep {
    # Print all lines from the position found by bsearch() for as long
    # as they match the search values as a prefix
    # Input:
    #   $file = name of the sorted file
    #   @search_vals = one search value per key definition
    # Returns: N/A
    my $file = shift;
    my @search_vals = @_;
    # bsearch() must return a byte position and not a line.
    # local() restores the previous setting when bgrep returns.
    local $opt::byte_offset = 1;
    # bsearch() returns "$pos\n": keep the number only
    my $startpos = int(bsearch($file,@search_vals));
    my $fh;
    if(not open ($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    seek($fh,$startpos,0) or die("Cannot seek to $startpos");
    # Allow for partial matches in grep (4 matches 40, A matches Aaa)
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 1;
    }
    my $line;
    # Test with defined(): a last line "0" without a record separator
    # is false but must still be compared
    while(defined($line = <$fh>)
          and
          not compare($line,@search_vals)) {
        print $line;
    }
    close $fh;
    # Back to full matches for the next bsearch()
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 0;
    }
}
|
||||
|
||||
sub bsearch {
    # Binary search a sorted file for the search values
    # Input:
    #   $file = name of the sorted file
    #   @search_vals = one search value per key definition
    # Returns:
    #   The line at $minnl if it compares >= the search values,
    #   otherwise the line following it (can be undef at end of file).
    #   If $opt::byte_offset is set: the byte position of that line
    #   followed by "\n" instead of the line itself.
    my($file,@search_vals) = @_;
    my $min = 0;
    my $max = -s $file;
    my $fh;
    open($fh, "<", $file) or do {
        error("Cannot open '$file'");
        exit 1;
    };
    my($line,$middle);
    # $min/$max are the byte positions we seek to.
    # $minnl/$maxnl are the line starts found from those positions.
    my($minnl,$maxnl) = ($min,$max);
    while($max - $min > 1) {
        $middle = int(($max + $min)/2);
        seek($fh,$middle,0) or die("Cannot seek to $middle");
        if($middle > 0) {
            # We landed somewhere inside a line: read the last half
            # of it so we are at the start of a line
            <$fh>;
        }
        my $newline_pos = tell($fh);
        debug("$min <= $middle <= $newline_pos <= $max\n");
        debug("$minnl <= $newline_pos <= $maxnl\n");
        if($newline_pos != $maxnl
           and
           not eof($fh)
           and
           compare(($line = <$fh>),@search_vals) < 0) {
            # The line compares lower than the search values:
            # we should search the upper half
            $min = $middle;
            $minnl = $newline_pos;
        } else {
            # We have seen this newline position before
            # or we are at the end of the file
            # or we should search the lower half
            $max = $middle;
            $maxnl = $newline_pos;
        }
    }
    seek($fh,$minnl,0) or die("Cannot seek to $minnl");
    $line = <$fh>;
    my $found_here = (compare($line,@search_vals) >= 0);
    if($opt::byte_offset) {
        # tell() is the start of the line after the one just read
        return(($found_here ? $minnl : tell($fh))."\n");
    }
    if(not $found_here) {
        # The line at $minnl is too low: the answer is the next line
        $line = <$fh>;
    }
    return $line;
}
|
||||
|
||||
sub parse_keydef {
    # Fill @Global::keydefs with one hashref per key definition
    # Each hashref has 'field', 'char' and one entry per ordering option.
    # Input: none (reads @opt::key and the global ordering options)
    # Returns: N/A

    # Ordering letter => [ name in the keydef, value of the global option ]
    my %order = (
        "b" => ['ignore_leading_blanks', $opt::ignore_leading_blanks],
        "d" => ['dictionary_order',      $opt::dictionary_order],
        "f" => ['ignore_case',           $opt::ignore_case],
        "g" => ['general_numeric_sort',  $opt::general_numeric_sort],
        "i" => ['ignore_nonprinting',    $opt::ignore_nonprinting],
        "M" => ['month_sort',            $opt::month_sort],
        "h" => ['human_numeric_sort',    $opt::human_numeric_sort],
        "n" => ['numeric_sort',          $opt::numeric_sort],
        "N" => ['numascii',              $opt::numascii],
        "r" => ['reverse',               $opt::reverse],
        "R" => ['random_sort',           $opt::random_sort],
        "V" => ['version_sort',          $opt::version_sort],
    );
    # Make a keydef for (field, char) with the global ordering options
    my $new_keydef = sub {
        my($field,$char) = @_;
        my %keydef = ('field' => $field, 'char' => $char);
        for my $entry (values %order) {
            my($name,$global_value) = @$entry;
            $keydef{$name} = $global_value;
        }
        return \%keydef;
    };

    if(not @opt::key) {
        # Convert -n -r to -k1rn
        # with sep = undef
        $Global::sep = undef;
        push(@Global::keydefs, $new_keydef->(1,1));
    }

    for my $keydefs (@opt::key) {
        for my $keydef (split /,/, $keydefs) {
            # parse keydef F[.C][OPTS][,F[.C][OPTS]]
            my($field,undef,$char,$flags) =
                $keydef =~ /^(\d+)(\.(\d+))?([bdfgiMhnNRrV]+)?$/
                or do {
                    error("Keydef $keydef does not match F[.C][OPTS]");
                    exit(255);
                };
            my $opt = $new_keydef->($field, $char || 1);
            # Options given in the keydef are switched on for this key
            for my $o (defined $flags ? split(//, $flags) : ()) {
                $opt->{$order{$o}[0]} = 1;
            }
            push(@Global::keydefs,$opt);
        }
    }
}
|
||||
|
||||
sub compare {
    # Compare a line from the file with the search values, key by key
    # Input:
    #   $line = line from the file
    #   @search_vals = one key to search for per search column
    # Returns:
    #   result of compare_single() for the first key that differs
    #   0 if no key differs
    my($line,@search_vals) = @_;
    chomp($line);
    debug("Compare: $line <=> @search_vals ");
    # Without a separator the whole line is the only field
    my @field = $Global::sep ? split(/$Global::sep/o, $line) : ($line);
    my $val_idx = 0;
    for my $keydef (@Global::keydefs) {
        # keydef = F[.C][OPTS][,F[.C][OPTS]]
        # field and char are 1-based
        my $cmp = compare_single(substr($field[$keydef->{'field'}-1],
                                        $keydef->{'char'}-1),
                                 $search_vals[$val_idx++],
                                 $keydef);
        debug("== $cmp\n");
        # They differ on this key
        return $cmp if $cmp;
    }
    # No difference on any keydefs
    return 0;
}
|
||||
|
||||
sub compare_single {
    # Compare one key from a line with one search value based on the
    # order options of the key definition
    # Input:
    #   $key = the part of the line selected by the key definition
    #   $search = the value searched for
    #   $opt = key definition (hashref with the ordering options)
    # Returns:
    #   negative, 0 or positive (like <=> and cmp)
    my ($key,$search,$opt) = @_;
    debug("$key <=> $search");
    # Dumping the key definition is expensive: only do it when the
    # result is going to be printed
    if($Global::debug) { debug(my_dump($opt),"\n"); }
    if($opt->{'random_sort'}) {
        return rand() <=> rand();
    }
    if($opt->{'ignore_leading_blanks'}) {
        $key =~ s/^\s+//;
        $search =~ s/^\s+//;
    }
    if($opt->{'ignore_case'}) {
        $key = uc($key);
        $search = uc($search);
    }
    if($opt->{'partial_match'}) {
        # String 'foo' matches 'foobar'
        $key = substr($key,0,length $search);
    }
    if($opt->{'reverse'}) {
        ($key,$search) = ($search,$key);
    }
    if($opt->{'human_numeric_sort'}) {
        return multiply_binary_prefix($key) <=> multiply_binary_prefix($search);
    }
    if($opt->{'month_sort'}) {
        # JAN => 1 ... DEC => 12. Anything else counts as 0.
        # The right hand side must be a list: {1..12} would be a
        # single hash reference.
        my @mon = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
        my %m;
        @m{@mon} = (1..12);
        return ($m{$key} || 0) <=> ($m{$search} || 0);
    }
    if($opt->{'numeric_sort'}) {
        return $key <=> $search;
    } elsif($opt->{'numascii'}) {
        # Numeric first, string comparison as tie-break.
        # Must be || and not 'or': 'return A or B' returns A and
        # never evaluates B.
        return(($key <=> $search) || ($key cmp $search));
    } else {
        return $key cmp $search;
    }
}
|
||||
|
||||
sub multiply_binary_prefix(@) {
    # Evaluate numbers with binary prefix
    # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^60, Zi=2^70, Yi=2^80, Xi=2^90
    # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^60, zi=2^70, yi=2^80, xi=2^90
    # K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^60, Z =2^70, Y =2^80, X =2^90
    # k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24, x=10^27
    # 13G = 13*1024*1024*1024 = 13958643712
    # Input:
    #   @_ = strings with prefixes
    # Returns:
    #   list context: all values with prefixes multiplied
    #   scalar context: the first value with prefixes multiplied
    my @v = @_;
    # The position in this list gives the power: k=1, m=2, ... x=9
    my @prefix = qw(k m g t p e z y x);
    for my $val (@v) {
        defined $val or next;
        # 1E3=1000, 1E-3=0.001
        $val =~ s/e([+-]?\d+)/*10**$1/gi;
        # The order of the three passes matters: "Ki" must be replaced
        # before a bare "K" or "k" is looked at.
        # Pass 1: two letter binary prefixes in any case (Ki, ki, kI)
        for my $power (1..@prefix) {
            my $letter = $prefix[$power-1];
            my $factor = "*1024" x $power;
            $val =~ s/${letter}i/$factor/gi;
        }
        # Pass 2: upper case single letter = binary
        for my $power (1..@prefix) {
            my $letter = uc $prefix[$power-1];
            my $factor = "*1024" x $power;
            $val =~ s/$letter/$factor/g;
        }
        # Pass 3: lower case single letter = decimal
        for my $power (1..@prefix) {
            my $letter = $prefix[$power-1];
            my $factor = "*1000" x $power;
            $val =~ s/$letter/$factor/g;
        }
        # NOTE(review): this evals text that can come from the searched
        # file or from the search strings. Anything that is not a plain
        # arithmetic expression is run as Perl code. Consider validating
        # the string before the eval.
        $val = eval $val;
    }
    return wantarray ? @v : $v[0];
}
|
||||
|
||||
sub status {
    # Print each argument followed by a newline on the status file
    # handle ($Global::status_fd if set, otherwise STDERR) and flush it
    # Input:
    #   @_ = lines to print
    my $out = $Global::status_fd || *STDERR;
    my @with_nl = map { ($_, "\n") } @_;
    print {$out} @with_nl;
    $out->flush;
}
|
||||
|
||||
sub status_no_nl {
    # Print the arguments as they are on the status file handle
    # ($Global::status_fd if set, otherwise STDERR) and flush it
    # Input:
    #   @_ = strings to print
    my $out = $Global::status_fd || *STDERR;
    print {$out} @_;
    $out->flush;
}
|
||||
|
||||
sub warning {
    # Print one "PROGNAME: Warning: MESSAGE" line per argument
    # Input:
    #   @_ = warning messages
    my $prog = $Global::progname || "parallel";
    my @out;
    for my $msg (@_) {
        push @out, $prog, ": Warning: ", $msg, "\n";
    }
    status_no_nl(@out);
}
|
||||
|
||||
sub error {
    # Print one "PROGNAME: Error: MESSAGE" line per argument
    # Input:
    #   @_ = error messages
    my $prog = $Global::progname || "parallel";
    my @out;
    for my $msg (@_) {
        push @out, $prog.": Error: ". $msg;
    }
    status(@out);
}
|
||||
|
||||
sub die_bug {
    # Tell the user how to report a bug on STDERR and exit with 255
    # Input:
    #   $bugid = string identifying where the bug was detected
    my($bugid) = @_;
    my @report =
        ("$Global::progname: This should not happen. You have found a bug.\n",
         "Please submit a bug at https://gitlab.com/ole.tange/tangetools/-/issues\n",
         "and include:\n",
         "* The version number: $Global::version\n",
         "* The bugid: $bugid\n",
         "* The command line being run\n",
         "* The files being read (put the files on a webserver if they are big)\n",
         "\n",
         "If you get the error on smaller/fewer files, please include those instead.\n");
    print STDERR @report;
    exit(255);
}
|
||||
|
||||
sub version {
    # Print program name, version, copyright and license on stdout
    # Returns: N/A
    my @lines =
        ("$Global::progname $Global::version",
         "Copyright (C) 2016-2020",
         "Ole Tange and Free Software Foundation, Inc.",
         "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
         "This is free software: you are free to change and redistribute it.",
         "$Global::progname comes with no warranty.",
         "",
         "Web site: https://gitlab.com/ole.tange/tangetools/\n",
        );
    print join("\n", @lines);
}
|
||||
|
||||
sub my_dump(@) {
    # Returns:
    #   ascii expression of object if Data::Dump(er) is installed
    #   error message otherwise
    my @dump_this = (@_);
    # Prefer Data::Dump. Check that dump() really is defined: if it is
    # not, fall back to Data::Dumper instead of dying.
    if(eval { require Data::Dump; 1 } and defined &Data::Dump::dump) {
        return (Data::Dump::dump(@dump_this));
    }
    # Data::Dump not installed
    if(eval { require Data::Dumper; 1 }) {
        return Data::Dumper::Dumper(@dump_this);
    }
    my $err = "Neither Data::Dump nor Data::Dumper is installed\n".
        "Not dumping output\n";
    ::status($err);
    return $err;
}
|
||||
|
||||
sub debug(@) {
    # Print the arguments on stdout if $Global::debug is set
    # Returns: N/A
    return unless $Global::debug;
    print @_;
}
|
194
2search/regressiontest
Executable file
194
2search/regressiontest
Executable file
|
@ -0,0 +1,194 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Base name for temporary files shared with the test functions.
# mktemp is used instead of tempfile: tempfile is Debian specific.
test_tmp=$(mktemp)
export test_tmp
|
||||
|
||||
opt_tester() {
    # Sort small inputs with the given options and search the result
    # with 2search using the same options.
    # $@ = options passed on to both sort and 2search (e.g. -n or -rn)
    opt="$@"
    # mktemp is used instead of tempfile: tempfile is Debian specific
    tmp=$(mktemp)
    test_2search() {
        # Show the content of the file on one line, then print the
        # lines found and the byte offsets found
        xargs echo Search in < "$tmp"
        2search $opt "$tmp" 0 2 2.1 100000
        2search $opt -B "$tmp" 0 2 2.1 100000
    }
    (true) | sort $opt > "$tmp"
    echo Search in null file
    test_2search

    (echo) | sort $opt > "$tmp"
    echo Search in newline
    test_2search

    (echo 1.000000000) | sort $opt > "$tmp"
    test_2search

    (echo 1.000000000;
     echo 2) | sort $opt > "$tmp"
    test_2search

    (echo 1;
     echo 2.000000000) | sort $opt > "$tmp"
    test_2search

    (echo 1.000000000;
     echo 2;
     echo 3) | sort $opt > "$tmp"
    test_2search

    (echo 1;
     echo 2.000000000;
     echo 3) | sort $opt > "$tmp"
    test_2search

    (echo 1;
     echo 2;
     echo 3.000000000) | sort $opt > "$tmp"
    test_2search

    rm "$tmp"
}
export -f opt_tester
|
||||
|
||||
test_n() {
    # Search hand-written numeric files with 2search -n.
    # The files are written directly and not piped through sort.
    tmp=${test_tmp}_n
    _n_search() {
        # Lines found followed by byte offsets found
        2search -n $tmp 0 2 2.1 100000
        2search -nB $tmp 0 2 2.1 100000
    }

    true > $tmp
    echo Search in null file
    _n_search

    echo > $tmp
    xargs echo Search in newline
    _n_search

    # One file per entry. The long number moves through every position
    # so the line lengths differ.
    for content in \
        "1.000000000" \
        "1.000000000 2" \
        "1 2.000000000" \
        "1.000000000 2 3" \
        "1 2.000000000 3" \
        "1 2 3.000000000"
    do
        # $content is unquoted on purpose: one word per line
        printf '%s\n' $content > $tmp
        xargs echo Search in < $tmp
        _n_search
    done
    rm $tmp
}
|
||||
|
||||
test_n_opt() {
    # Standard searches on files sorted numerically
    opt_tester -n
}
|
||||
|
||||
test_rn_opt() {
    # Standard searches on files sorted numerically in reverse
    opt_tester -rn
}
|
||||
|
||||
test_r_opt() {
    # NOTE(review): this runs the same options as test_rn_opt. The name
    # suggests it was meant to be "opt_tester -r" - confirm, and
    # regenerate regressiontest.out if it is changed.
    opt_tester -rn
}
|
||||
|
||||
test_k32_2n_1n() {
    # Search on 3 keys (-k3N,2N,1n) with 2grep
    # mktemp is used instead of tempfile: tempfile is Debian specific
    tmp=$(mktemp)
    cat >"$tmp" <<EOF
1 chr1 Sample 1
11 chr1 Sample 1
111 chr1 Sample 1
1111 chr1 Sample 1
11111 chr1 Sample 1
111111 chr1 Sample 1
1 chr2 Sample 1
22 chr2 Sample 1
111 chr2 Sample 1
2222 chr2 Sample 1
11111 chr2 Sample 1
111111 chr2 Sample 1
1 chr10 Sample 1
11 chr10 Sample 1
111 chr10 Sample 1
1111 chr10 Sample 1
11111 chr10 Sample 1
111111 chr10 Sample 1
1 chr1 Sample 2
11 chr1 Sample 2
111 chr1 Sample 2
1111 chr1 Sample 2
11111 chr1 Sample 2
111111 chr1 Sample 2
1 chr2 Sample 2
22 chr2 Sample 2
111 chr2 Sample 2
2222 chr2 Sample 2
11111 chr2 Sample 2
111111 chr2 Sample 2
1 chr10 Sample 2
11 chr10 Sample 2
111 chr10 Sample 2
1111 chr10 Sample 2
11111 chr10 Sample 2
111111 chr10 Sample 2
1 chr1 Sample 10
11 chr1 Sample 10
111 chr1 Sample 10
1111 chr1 Sample 10
11111 chr1 Sample 10
111111 chr1 Sample 10
1 chr2 Sample 10
22 chr2 Sample 10
111 chr2 Sample 10
2222 chr2 Sample 10
11111 chr2 Sample 10
111111 chr2 Sample 10
1 chr10 Sample 10
11 chr10 Sample 10
111 chr10 Sample 10
1111 chr10 Sample 10
11111 chr10 Sample 10
111111 chr10 Sample 10
EOF
    2grep -k3N,2N,1n "$tmp" 'Sample 10' chr10 111
    # Remove the file instead of printing its name: the name is random
    # and would make the output differ from regressiontest.out on
    # every run
    rm "$tmp"
}
|
||||
|
||||
test_partial_line() {
    # Grep for a prefix (3 matches 3 and 30..39) through all three ways
    # of calling grep mode
    # mktemp is used instead of tempfile: tempfile is Debian specific
    tmp=$(mktemp)
    seq 100 | LC_ALL=C sort > "$tmp"
    echo '### 2search --grep'
    2search --grep "$tmp" 3
    echo '### 2grep'
    2grep "$tmp" 3
    echo '### ... | 2grep'
    echo 3 | 2grep "$tmp"
    rm "$tmp"
}
|
||||
|
||||
|
||||
# Make every function with test_ in its name available to the shells
# started by parallel
export -f $(compgen -A function | grep test_)
# Run the tests in sorted order. --tag prefixes each output line with
# the name of the test, -k keeps the output in input order.
compgen -A function |
    grep test_ |
    sort |
    parallel -j6 --tag -k '{} 2>&1' > regressiontest.new
# Compare with the recorded output
diff regressiontest.new regressiontest.out
|
280
2search/regressiontest.out
Normal file
280
2search/regressiontest.out
Normal file
|
@ -0,0 +1,280 @@
|
|||
test_k32_2n_1n 111 chr10 Sample 10
|
||||
test_k32_2n_1n 1111 chr10 Sample 10
|
||||
test_k32_2n_1n 11111 chr10 Sample 10
|
||||
test_k32_2n_1n 111111 chr10 Sample 10
|
||||
test_n Search in null file
|
||||
test_n 0
|
||||
test_n 0
|
||||
test_n 0
|
||||
test_n 0
|
||||
test_n Search in newline
|
||||
test_n
|
||||
test_n 0
|
||||
test_n 1
|
||||
test_n 1
|
||||
test_n 1
|
||||
test_n Search in 1.000000000
|
||||
test_n 1.000000000
|
||||
test_n 0
|
||||
test_n 12
|
||||
test_n 12
|
||||
test_n 12
|
||||
test_n Search in 1.000000000 2
|
||||
test_n 1.000000000
|
||||
test_n 2
|
||||
test_n 0
|
||||
test_n 12
|
||||
test_n 14
|
||||
test_n 14
|
||||
test_n Search in 1 2.000000000
|
||||
test_n 1
|
||||
test_n 2.000000000
|
||||
test_n 0
|
||||
test_n 2
|
||||
test_n 14
|
||||
test_n 14
|
||||
test_n Search in 1.000000000 2 3
|
||||
test_n 1.000000000
|
||||
test_n 2
|
||||
test_n 3
|
||||
test_n 0
|
||||
test_n 12
|
||||
test_n 14
|
||||
test_n 16
|
||||
test_n Search in 1 2.000000000 3
|
||||
test_n 1
|
||||
test_n 2.000000000
|
||||
test_n 3
|
||||
test_n 0
|
||||
test_n 2
|
||||
test_n 14
|
||||
test_n 16
|
||||
test_n Search in 1 2 3.000000000
|
||||
test_n 1
|
||||
test_n 2
|
||||
test_n 3.000000000
|
||||
test_n 0
|
||||
test_n 2
|
||||
test_n 4
|
||||
test_n 16
|
||||
test_n_opt Search in null file
|
||||
test_n_opt Search in
|
||||
test_n_opt 0
|
||||
test_n_opt 0
|
||||
test_n_opt 0
|
||||
test_n_opt 0
|
||||
test_n_opt Search in newline
|
||||
test_n_opt Search in
|
||||
test_n_opt
|
||||
test_n_opt 0
|
||||
test_n_opt 1
|
||||
test_n_opt 1
|
||||
test_n_opt 1
|
||||
test_n_opt Search in 1.000000000
|
||||
test_n_opt 1.000000000
|
||||
test_n_opt 0
|
||||
test_n_opt 12
|
||||
test_n_opt 12
|
||||
test_n_opt 12
|
||||
test_n_opt Search in 1.000000000 2
|
||||
test_n_opt 1.000000000
|
||||
test_n_opt 2
|
||||
test_n_opt 0
|
||||
test_n_opt 12
|
||||
test_n_opt 14
|
||||
test_n_opt 14
|
||||
test_n_opt Search in 1 2.000000000
|
||||
test_n_opt 1
|
||||
test_n_opt 2.000000000
|
||||
test_n_opt 0
|
||||
test_n_opt 2
|
||||
test_n_opt 14
|
||||
test_n_opt 14
|
||||
test_n_opt Search in 1.000000000 2 3
|
||||
test_n_opt 1.000000000
|
||||
test_n_opt 2
|
||||
test_n_opt 3
|
||||
test_n_opt 0
|
||||
test_n_opt 12
|
||||
test_n_opt 14
|
||||
test_n_opt 16
|
||||
test_n_opt Search in 1 2.000000000 3
|
||||
test_n_opt 1
|
||||
test_n_opt 2.000000000
|
||||
test_n_opt 3
|
||||
test_n_opt 0
|
||||
test_n_opt 2
|
||||
test_n_opt 14
|
||||
test_n_opt 16
|
||||
test_n_opt Search in 1 2 3.000000000
|
||||
test_n_opt 1
|
||||
test_n_opt 2
|
||||
test_n_opt 3.000000000
|
||||
test_n_opt 0
|
||||
test_n_opt 2
|
||||
test_n_opt 4
|
||||
test_n_opt 16
|
||||
test_partial_line ### 2search --grep
|
||||
test_partial_line 3
|
||||
test_partial_line 30
|
||||
test_partial_line 31
|
||||
test_partial_line 32
|
||||
test_partial_line 33
|
||||
test_partial_line 34
|
||||
test_partial_line 35
|
||||
test_partial_line 36
|
||||
test_partial_line 37
|
||||
test_partial_line 38
|
||||
test_partial_line 39
|
||||
test_partial_line ### 2grep
|
||||
test_partial_line 3
|
||||
test_partial_line 30
|
||||
test_partial_line 31
|
||||
test_partial_line 32
|
||||
test_partial_line 33
|
||||
test_partial_line 34
|
||||
test_partial_line 35
|
||||
test_partial_line 36
|
||||
test_partial_line 37
|
||||
test_partial_line 38
|
||||
test_partial_line 39
|
||||
test_partial_line ### ... | 2grep
|
||||
test_partial_line 3
|
||||
test_partial_line 30
|
||||
test_partial_line 31
|
||||
test_partial_line 32
|
||||
test_partial_line 33
|
||||
test_partial_line 34
|
||||
test_partial_line 35
|
||||
test_partial_line 36
|
||||
test_partial_line 37
|
||||
test_partial_line 38
|
||||
test_partial_line 39
|
||||
test_rn_opt Search in null file
|
||||
test_rn_opt Search in
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt Search in newline
|
||||
test_rn_opt Search in
|
||||
test_rn_opt
|
||||
test_rn_opt
|
||||
test_rn_opt
|
||||
test_rn_opt
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt Search in 1.000000000
|
||||
test_rn_opt 1.000000000
|
||||
test_rn_opt 1.000000000
|
||||
test_rn_opt 1.000000000
|
||||
test_rn_opt 12
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt Search in 2 1.000000000
|
||||
test_rn_opt 2
|
||||
test_rn_opt 2
|
||||
test_rn_opt 2
|
||||
test_rn_opt 14
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt Search in 2.000000000 1
|
||||
test_rn_opt 2.000000000
|
||||
test_rn_opt 2.000000000
|
||||
test_rn_opt 2.000000000
|
||||
test_rn_opt 14
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt 0
|
||||
test_rn_opt Search in 3 2 1.000000000
|
||||
test_rn_opt 2
|
||||
test_rn_opt 2
|
||||
test_rn_opt 3
|
||||
test_rn_opt 16
|
||||
test_rn_opt 2
|
||||
test_rn_opt 2
|
||||
test_rn_opt 0
|
||||
test_rn_opt Search in 3 2.000000000 1
|
||||
test_rn_opt 2.000000000
|
||||
test_rn_opt 2.000000000
|
||||
test_rn_opt 3
|
||||
test_rn_opt 16
|
||||
test_rn_opt 2
|
||||
test_rn_opt 2
|
||||
test_rn_opt 0
|
||||
test_rn_opt Search in 3.000000000 2 1
|
||||
test_rn_opt 2
|
||||
test_rn_opt 2
|
||||
test_rn_opt 3.000000000
|
||||
test_rn_opt 16
|
||||
test_rn_opt 12
|
||||
test_rn_opt 12
|
||||
test_rn_opt 0
|
||||
test_r_opt Search in null file
|
||||
test_r_opt Search in
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt Search in newline
|
||||
test_r_opt Search in
|
||||
test_r_opt
|
||||
test_r_opt
|
||||
test_r_opt
|
||||
test_r_opt
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt Search in 1.000000000
|
||||
test_r_opt 1.000000000
|
||||
test_r_opt 1.000000000
|
||||
test_r_opt 1.000000000
|
||||
test_r_opt 12
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt Search in 2 1.000000000
|
||||
test_r_opt 2
|
||||
test_r_opt 2
|
||||
test_r_opt 2
|
||||
test_r_opt 14
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt Search in 2.000000000 1
|
||||
test_r_opt 2.000000000
|
||||
test_r_opt 2.000000000
|
||||
test_r_opt 2.000000000
|
||||
test_r_opt 14
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt 0
|
||||
test_r_opt Search in 3 2 1.000000000
|
||||
test_r_opt 2
|
||||
test_r_opt 2
|
||||
test_r_opt 3
|
||||
test_r_opt 16
|
||||
test_r_opt 2
|
||||
test_r_opt 2
|
||||
test_r_opt 0
|
||||
test_r_opt Search in 3 2.000000000 1
|
||||
test_r_opt 2.000000000
|
||||
test_r_opt 2.000000000
|
||||
test_r_opt 3
|
||||
test_r_opt 16
|
||||
test_r_opt 2
|
||||
test_r_opt 2
|
||||
test_r_opt 0
|
||||
test_r_opt Search in 3.000000000 2 1
|
||||
test_r_opt 2
|
||||
test_r_opt 2
|
||||
test_r_opt 3.000000000
|
||||
test_r_opt 16
|
||||
test_r_opt 12
|
||||
test_r_opt 12
|
||||
test_r_opt 0
|
25
Makefile
25
Makefile
|
@ -1,21 +1,22 @@
|
|||
CMD = blink bsearch burncpu duplicate-packets em encdir field forever \
|
||||
CMD = blink 2grep 2search burncpu duplicate-packets em encdir field forever \
|
||||
fxkill G gitnext gitundo goodpasswd histogram mtrr mirrorpdf \
|
||||
neno off pdfman pidcmd plotpipe puniq ramusage rand rclean \
|
||||
rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
|
||||
swapout T timestamp tracefile transpose upsidedown vid \
|
||||
w4it-for-port-open whitehash wifi-reload wssh ytv yyyymmdd
|
||||
|
||||
all: blink/blink.1 bsearch/bsearch.1 burncpu/burncpu.1 \
|
||||
encdir/encdir.1 G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 \
|
||||
goodpasswd/goodpasswd.1 histogram/histogram.1 \
|
||||
mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 \
|
||||
pidcmd/pidcmd.1 plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 \
|
||||
rina/rina.1 rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 \
|
||||
shython/shython.1 sound-reload/sound-reload.1 \
|
||||
splitvideo/splitvideo.1 stdout/stdout.1 timestamp/timestamp.1 \
|
||||
tracefile/tracefile.1 transpose/transpose.1 T/T.1 \
|
||||
upsidedown/upsidedown.1 vid/vid.1 wifi-reload/wifi-reload.1 \
|
||||
wssh/wssh.1 ytv/ytv.1 yyyymmdd/yyyymmdd.1
|
||||
all: blink/blink.1 2search/2grep.1 2search/2search.1 \
|
||||
burncpu/burncpu.1 encdir/encdir.1 G/G.1 gitnext/gitnext.1 \
|
||||
gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
|
||||
histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
|
||||
off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 plotpipe/plotpipe.1 \
|
||||
puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \
|
||||
seekmaniac/seekmaniac.1 shython/shython.1 \
|
||||
sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
|
||||
stdout/stdout.1 timestamp/timestamp.1 tracefile/tracefile.1 \
|
||||
transpose/transpose.1 T/T.1 upsidedown/upsidedown.1 vid/vid.1 \
|
||||
wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 \
|
||||
yyyymmdd/yyyymmdd.1
|
||||
|
||||
%.1: %
|
||||
pod2man $< > $@
|
||||
|
|
404
bsearch/bsearch
404
bsearch/bsearch
|
@ -1,404 +0,0 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
=head1 NAME
|
||||
|
||||
bsearch - binary search through sorted text files
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
B<bsearch> [-nrfB] file string [string...]
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
B<bsearch> searches a sorted file for a string. It outputs the
|
||||
following line or the byte position of this line, which is where the
|
||||
string would have been if it had been in the sorted file.
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<--ignore-leading-blanks> (not implemented)
|
||||
|
||||
=item B<-b>
|
||||
|
||||
ignore leading blanks
|
||||
|
||||
=item B<--byte-offset>
|
||||
|
||||
=item B<-B>
|
||||
|
||||
print byte position where string would have been
|
||||
|
||||
=item B<--dictionary-order> (not implemented)
|
||||
|
||||
=item B<-d>
|
||||
|
||||
consider only blanks and alphanumeric characters
|
||||
|
||||
=item B<--debug> (not implemented)
|
||||
|
||||
=item B<-D>
|
||||
|
||||
annotate the part of the line used to sort, and warn about
|
||||
questionable usage to stderr
|
||||
|
||||
=item B<--ignore-case>
|
||||
|
||||
=item B<-f>
|
||||
|
||||
fold lower case to upper case characters
|
||||
|
||||
=item B<--general-numeric-sort> (not implemented)
|
||||
|
||||
=item B<-g>
|
||||
|
||||
compare according to general numerical value
|
||||
|
||||
=item B<--ignore-nonprinting> (not implemented)
|
||||
|
||||
=item B<-i>
|
||||
|
||||
consider only printable characters
|
||||
|
||||
=item B<--month-sort> (not implemented)
|
||||
|
||||
=item B<-M>
|
||||
|
||||
compare (unknown) < 'JAN' < ... < 'DEC'
|
||||
|
||||
=item B<--human-numeric-sort> (not implemented)
|
||||
|
||||
=item B<-h>
|
||||
|
||||
compare human readable numbers (e.g., 2K 1G)
|
||||
|
||||
=item B<--key=KEYDEF> (not implemented)
|
||||
|
||||
=item B<-k>
|
||||
|
||||
sort via a key; KEYDEF gives location and type
|
||||
|
||||
=item B<--numeric-sort>
|
||||
|
||||
=item B<-n>
|
||||
|
||||
compare according to string numerical value
|
||||
|
||||
=item B<--random-sort>
|
||||
|
||||
=item B<-R>
|
||||
|
||||
sort by random hash of keys
|
||||
|
||||
=item B<--reverse>
|
||||
|
||||
=item B<-r>
|
||||
|
||||
reverse the result of comparisons
|
||||
|
||||
=item B<--sort=WORD> (not implemented)
|
||||
|
||||
sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
|
||||
B<-M>, numeric B<-n>, random B<-R>, version B<-V>
|
||||
|
||||
=item B<-t> (not implemented)
|
||||
|
||||
=item B<--field-separator=SEP>
|
||||
|
||||
use SEP instead of non-blank to blank transition
|
||||
|
||||
=item B<-z>
|
||||
|
||||
=item B<--zero-terminated>
|
||||
|
||||
end lines with 0 byte, not newline
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 Missing
|
||||
|
||||
Missing
|
||||
|
||||
|
||||
=head1 REPORTING BUGS
|
||||
|
||||
B<bsearch> is part of tangetools. Report bugs to <tools@tange.dk>.
|
||||
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Copyright (C) 2016 Ole Tange http://ole.tange.dk
|
||||
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
Copyright (C) 2013 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
=head2 Documentation license I
|
||||
|
||||
Permission is granted to copy, distribute and/or modify this documentation
|
||||
under the terms of the GNU Free Documentation License, Version 1.3 or
|
||||
any later version published by the Free Software Foundation; with no
|
||||
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
|
||||
Texts. A copy of the license is included in the file fdl.txt.
|
||||
|
||||
=head2 Documentation license II
|
||||
|
||||
You are free:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<to Share>
|
||||
|
||||
to copy, distribute and transmit the work
|
||||
|
||||
=item B<to Remix>
|
||||
|
||||
to adapt the work
|
||||
|
||||
=back
|
||||
|
||||
Under the following conditions:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Attribution>
|
||||
|
||||
You must attribute the work in the manner specified by the author or
|
||||
licensor (but not in any way that suggests that they endorse you or
|
||||
your use of the work).
|
||||
|
||||
=item B<Share Alike>
|
||||
|
||||
If you alter, transform, or build upon this work, you may distribute
|
||||
the resulting work only under the same, similar or a compatible
|
||||
license.
|
||||
|
||||
=back
|
||||
|
||||
With the understanding that:
|
||||
|
||||
=over 9
|
||||
|
||||
=item B<Waiver>
|
||||
|
||||
Any of the above conditions can be waived if you get permission from
|
||||
the copyright holder.
|
||||
|
||||
=item B<Public Domain>
|
||||
|
||||
Where the work or any of its elements is in the public domain under
|
||||
applicable law, that status is in no way affected by the license.
|
||||
|
||||
=item B<Other Rights>
|
||||
|
||||
In no way are any of the following rights affected by the license:
|
||||
|
||||
=over 9
|
||||
|
||||
=item *
|
||||
|
||||
Your fair dealing or fair use rights, or other applicable
|
||||
copyright exceptions and limitations;
|
||||
|
||||
=item *
|
||||
|
||||
The author's moral rights;
|
||||
|
||||
=item *
|
||||
|
||||
Rights other persons may have either in the work itself or in
|
||||
how the work is used, such as publicity or privacy rights.
|
||||
|
||||
=back
|
||||
|
||||
=item B<Notice>
|
||||
|
||||
For any reuse or distribution, you must make clear to others the
|
||||
license terms of this work.
|
||||
|
||||
=back
|
||||
|
||||
A copy of the full license is included in the file cc-by-sa.txt.
|
||||
|
||||
=head1 DEPENDENCIES
|
||||
|
||||
B<bsearch> uses Perl.
|
||||
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
B<grep>(1), B<sort>(1).
|
||||
|
||||
=cut
|
||||
|
||||
use Getopt::Long;
|
||||
|
||||
# Option parsing and main program.
#
# "bundling" lets single-letter options be combined (-nr).
# "require_order" stops option parsing at the first non-option argument,
# so search keys given after the file name are never parsed as options.
Getopt::Long::Configure("bundling","require_order");

if(not GetOptions(
       "debug|D=s" => \$opt::D,
       "version" => \$opt::version,
       "verbose|v" => \$opt::verbose,
       "B|byte-offset" => \$opt::byte_offset,
       "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
       "d|dictionary-order" => \$opt::dictionary_order,
       "f|ignore-case" => \$opt::ignore_case,
       "g|general-numeric-sort" => \$opt::general_numeric_sort,
       "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
       "M|month-sort" => \$opt::month_sort,
       "h|human-numeric-sort" => \$opt::human_numeric_sort,
       "n|numeric-sort" => \$opt::numeric_sort,
       "r|reverse" => \$opt::reverse,
       "R|random-sort" => \$opt::random_sort,
       "sort=s" => \$opt::sort,
       "V|version-sort" => \$opt::version_sort,
       "k|key=s" => \@opt::key,
       "t|field-separator=s" => \$opt::field_separator,
       "z|zero-terminated" => \$opt::zero_terminated,
   )) {
    # Getopt::Long has already reported the offending option on stderr.
    # Continuing would search with options the user did not ask for.
    exit 1;
}

# NOTE(review): the POD names this tool 2search/2grep; confirm whether
# the program name used in messages should follow.
$Global::progname = "bsearch";
$Global::version = 20160712;

if($opt::version) {
    version();
    exit 0;
}

# -z: records are NUL-terminated instead of newline-terminated
if($opt::zero_terminated) { $/ = "\0"; }

# First non-option argument is the sorted file; the rest are search keys
my $file = shift;
if(not defined $file) {
    error("No file given");
    exit 1;
}

for my $key (@ARGV) {
    print bsearch($file,$key);
}
|
||||
|
||||
sub bsearch {
    # Binary search through a sorted file for a key.
    # Input:
    #   $file = name of the sorted file
    #   $key = string to search for
    # Uses:
    #   $opt::byte_offset
    #   compare() for the ordering
    # Returns:
    #   With $opt::byte_offset: "0\n" if the line read at the final
    #     position compares >= $key, otherwise the file position after
    #     that line followed by "\n".
    #   Without: "" if the line read at the final position compares
    #     >= $key, otherwise that line.
    # Exits with status 1 if the file cannot be opened.
    my $file = shift;
    my $key = shift;
    my $min = 0;
    my $max = -s $file;

    # Lexical handle, so repeated calls do not share a package global
    my $fh;
    if(not open($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    my $line;
    # Narrow the byte range [$min,$max] until it is at most 1 byte wide
    while($max - $min > 1) {
        my $middle = int(($max + $min)/2);
        seek($fh,$middle,0)
            or die "Cannot seek to $middle in '$file': $!";
        # $middle may be inside a line: read and discard up to the next
        # record separator so the following read starts on a boundary
        my $half = <$fh>;
        if(eof($fh)
           or
           compare(($line = <$fh>),$key) >= 0) {
            $max = $middle;
        } else {
            $min = $middle;
        }
    }
    seek($fh,$max,0)
        or die "Cannot seek to $max in '$file': $!";
    $line = <$fh>;
    my $result;
    if(compare($line,$key) >= 0) {
        # The very first line
        $result = $opt::byte_offset ? "0\n" : "";
    } else {
        # tell() must be read before the handle is closed
        $result = $opt::byte_offset ? tell($fh)."\n" : $line;
    }
    close $fh;
    return $result;
}
|
||||
|
||||
sub compare {
    # Compare two strings according to the sort options in effect.
    # Input:
    #   two strings
    # Uses:
    #   $opt::random_sort = compare two random numbers instead
    #   $opt::reverse = swap the operands
    #   $opt::ignore_case = upper-case both operands first
    #   $opt::numeric_sort = compare numerically
    #   $opt::numascii = compare numerically, ties broken by string order
    # Returns:
    #   negative, zero or positive, as <=> and cmp do
    #
    # The operands are not called $a/$b: those names belong to sort().
    my ($left,$right) = @_;
    if($opt::random_sort) {
        return rand() <=> rand();
    }
    if($opt::reverse) {
        ($left,$right) = ($right,$left);
    }
    if($opt::ignore_case) {
        $left = uc($left);
        $right = uc($right);
    }
    if($opt::numeric_sort) {
        return $left <=> $right;
    } elsif($opt::numascii) {
        # '||' inside parentheses: with the low-precedence 'or' the
        # statement parsed as '(return $left <=> $right) or ...' and the
        # string tie-break was never evaluated.
        return ($left <=> $right || $left cmp $right);
    } else {
        return $left cmp $right;
    }
}
|
||||
|
||||
sub status {
    # Print every argument followed by a newline on the status
    # filehandle and flush it.
    # Uses:
    #   $Global::status_fd = filehandle to print to (STDERR if false)
    my @messages = @_;
    my $out = $Global::status_fd || *STDERR;
    my @chunks;
    for my $msg (@messages) {
        push @chunks, $msg, "\n";
    }
    print $out @chunks;
    flush $out;
}
|
||||
|
||||
sub status_no_nl {
    # Print the arguments as they are (no newline is added) on the
    # status filehandle and flush it.
    # Uses:
    #   $Global::status_fd = filehandle to print to (STDERR if false)
    my @pieces = @_;
    my $out = $Global::status_fd;
    if(not $out) {
        $out = *STDERR;
    }
    print $out @pieces;
    flush $out;
}
|
||||
|
||||
sub warning {
    # Print each argument as "<progname>: Warning: <message>\n" through
    # status_no_nl().
    # Uses:
    #   $Global::progname ("parallel" is used if it is false)
    my @messages = @_;
    my $name = $Global::progname || "parallel";
    my @out;
    for my $msg (@messages) {
        push @out, $name, ": Warning: ", $msg, "\n";
    }
    status_no_nl(@out);
}
|
||||
|
||||
sub error {
    # Print each argument as "<progname>: Error: <message>" through
    # status(), which adds the newline.
    # Uses:
    #   $Global::progname ("parallel" is used if it is false)
    my @messages = @_;
    my $name = $Global::progname || "parallel";
    my @out;
    for my $msg (@messages) {
        push @out, $name.": Error: ".$msg;
    }
    status(@out);
}
|
||||
|
||||
sub die_bug {
    # Report an internal error on STDERR and exit with status 255.
    # Input:
    #   $bugid = identifier of the place the bug was detected; it is
    #            included in the message
    # Uses:
    #   $Global::progname
    #   $Global::version
    # Returns: N/A (exits)
    my $bugid = shift;
    print STDERR
        ("$Global::progname: This should not happen. You have found a bug.\n",
         "Please contact <parallel\@gnu.org> and include:\n",
         "* The version number: $Global::version\n",
         "* The bugid: $bugid\n",
         "* The command line being run\n",
         "* The files being read (put the files on a webserver if they are big)\n",
         "\n",
         "If you get the error on smaller/fewer files, please include those instead.\n");
    # Exit directly: the previous ::wait_and_exit(255) call referred to
    # a sub that this script does not define.
    exit(255);
}
|
||||
|
||||
sub version {
    # Print the version and license banner on STDOUT.
    # Uses:
    #   $Global::progname
    #   $Global::version
    # Returns: N/A
    my @banner = (
        "GNU $Global::progname $Global::version",
        "Copyright (C) 2016",
        "Ole Tange and Free Software Foundation, Inc.",
        "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
        "This is free software: you are free to change and redistribute it.",
        "GNU $Global::progname comes with no warranty.",
        "",
        "Web site: http://www.gnu.org/software/${Global::progname}\n",
        "When using programs that use GNU Parallel to process data for publication",
        "please cite as described in 'parallel --citation'.\n",
    );
    # Entries that already end in "\n" produce an empty line after them
    print join("\n", @banner);
}
|
|
@ -1,44 +0,0 @@
|
|||
#!/bin/bash

# Base name for the scratch files used by the test functions. It is
# exported because the test functions are run through parallel below.
# mktemp is used because tempfile(1) is deprecated in its favour.
test_tmp=$(mktemp)
export test_tmp
|
||||
|
||||
# Run 'bsearch -n' with the keys 0, 2, 2.1 and 100000 against a series
# of small files: empty, a single blank line, and one to three numeric
# lines where one line at a time is written as N.000000000.
# For every file except the empty one, the content is first echoed
# through xargs so the output shows which fixture is being searched.
test_n() {
    local tmp="${test_tmp}_n"
    local keys=(0 2 2.1 100000)
    local fixture

    # Empty file
    true > "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    # A single empty line
    echo > "$tmp"
    xargs < "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    # One line per word of each fixture
    for fixture in \
        "1.000000000" \
        "1.000000000 2" \
        "1 2.000000000" \
        "1.000000000 2 3" \
        "1 2.000000000 3" \
        "1 2 3.000000000"
    do
        # $fixture is deliberately unquoted: word splitting turns each
        # number into its own line
        # shellcheck disable=SC2086
        printf '%s\n' $fixture > "$tmp"
        xargs < "$tmp"
        bsearch -n "$tmp" "${keys[@]}"
    done

    rm "$tmp"
}
|
||||
|
||||
|
||||
# Export every function whose name contains "test_" so the shells
# started by parallel can call them. The command substitution is
# deliberately unquoted: each function name must be its own argument.
# shellcheck disable=SC2046
export -f $(compgen -A function | grep test_)
# Run the tests, 6 at a time; --tag prefixes each output line with the
# test name and -k keeps the output in sorted test order.
compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1'
# The file created for $test_tmp was never removed; the tests only
# remove their own ${test_tmp}_* files.
rm -f "$test_tmp"
|
Loading…
Reference in a new issue