2search: bsearch/bgrep renamed to 2search/2grep (bgrep is used by others).

This commit is contained in:
Ole Tange 2020-03-28 15:37:52 +01:00
parent 9efd18d0fc
commit e8f520f642
7 changed files with 2041 additions and 460 deletions

777
2search/2grep Executable file
View file

@ -0,0 +1,777 @@
#!/usr/bin/perl
=head1 NAME
2search - binary search through sorted text files
2grep - binary search+grep through sorted text files
=head1 SYNOPSIS
B<2search> [-nrfB] file string [string...]
B<2search> --grep [-nrf] file string [string...]
B<2grep> [-nrf] file string [string...]
... | B<2search> [-nrfB] file
... | B<2search> --grep [-nrf] file
... | B<2grep> [-nrf] file
=head1 DESCRIPTION
B<2search> searches a sorted file for a string. It outputs the
following line or the byte position of this line, which is where the
string would have been if it had been in the sorted file.
B<2grep> outputs all lines starting with a given string. The file must
be sorted.
=over 9
=item B<--ignore-leading-blanks>
=item B<-b>
ignore leading blanks
=item B<--byte-offset>
=item B<-B>
print byte position where string would have been
=item B<--dictionary-order> (not implemented)
=item B<-d>
consider only blanks and alphanumeric characters
=item B<--debug> (not implemented)
=item B<-D>
annotate the part of the line used to sort, and warn about
questionable usage to stderr
=item B<--ignore-case>
=item B<-f>
fold lower case to upper case characters
=item B<--file> I<file>
=item B<-F> I<file>
search for all lines in I<file>
=item B<--general-numeric-sort> (not implemented)
=item B<-g>
compare according to general numerical value
=item B<--ignore-nonprinting> (not implemented)
=item B<-i>
consider only printable characters
=item B<--month-sort>
=item B<-M>
compare (unknown) < 'JAN' < ... < 'DEC'
=item B<--human-numeric-sort>
=item B<-h>
compare human readable numbers (e.g., 2K 1G)
=item B<--key=KEYDEF> (not implemented)
=item B<-k>
sort via a key; KEYDEF gives location and type
=item B<--numeric-sort>
=item B<-n>
compare according to string numerical value. If numerical values are
the same: split the string into blocks of numbers and non-numbers, and
compare numbers as numbers and strings as strings.
This will sort like this: chr3 chr11 3chr 11chr
=item B<--numascii>
=item B<-N>
compare according to string numerical value. If numerical values are
the same: compare as strings
=item B<--random-sort>
=item B<-R>
sort by random hash of keys
=item B<--reverse>
=item B<-r>
reverse the result of comparisons
=item B<--sort=WORD> (not implemented)
sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
B<-M>, numeric B<-n>, random B<-R>, version B<-V>
=item B<-t>
=item B<--field-separator=SEP>
use SEP instead of non-blank to blank transition
=item B<-z>
=item B<--zero-terminated>
end lines with 0 byte, not newline
=back
=head1 EXAMPLES
=head2 Single key
Input is sorted by Chromosome,Position:
SampleID Position Chromosome
foo 10000123 chr3
foo 10000125 chr3
foo 9999998 chr11
foo 10000124 chr11
foo 10000126 chr11
To find all chr3:
2grep -n -k3 inputfile chr3
-n will split 'chr3' into 'chr' which is compared asciibetically and
'3' which is compared numerically.
=head2 Not implemented
To find all lines with chr3,10000125:
2grep -k3n,2n inputfile chr3 10000125
=head1 REPORTING BUGS
B<2search> is part of tangetools. Report bugs to <tools@tange.dk>.
=head1 AUTHOR
Copyright (C) 2016-2020 Ole Tange http://ole.tange.dk
=head1 LICENSE
Copyright (C) 2013 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
at your option any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head2 Documentation license I
Permission is granted to copy, distribute and/or modify this documentation
under the terms of the GNU Free Documentation License, Version 1.3 or
any later version published by the Free Software Foundation; with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts. A copy of the license is included in the file fdl.txt.
=head2 Documentation license II
You are free:
=over 9
=item B<to Share>
to copy, distribute and transmit the work
=item B<to Remix>
to adapt the work
=back
Under the following conditions:
=over 9
=item B<Attribution>
You must attribute the work in the manner specified by the author or
licensor (but not in any way that suggests that they endorse you or
your use of the work).
=item B<Share Alike>
If you alter, transform, or build upon this work, you may distribute
the resulting work only under the same, similar or a compatible
license.
=back
With the understanding that:
=over 9
=item B<Waiver>
Any of the above conditions can be waived if you get permission from
the copyright holder.
=item B<Public Domain>
Where the work or any of its elements is in the public domain under
applicable law, that status is in no way affected by the license.
=item B<Other Rights>
In no way are any of the following rights affected by the license:
=over 9
=item *
Your fair dealing or fair use rights, or other applicable
copyright exceptions and limitations;
=item *
The author's moral rights;
=item *
Rights other persons may have either in the work itself or in
how the work is used, such as publicity or privacy rights.
=back
=item B<Notice>
For any reuse or distribution, you must make clear to others the
license terms of this work.
=back
A copy of the full license is included in the file as cc-by-sa.txt.
=head1 DEPENDENCIES
B<2search>/B<2grep> uses Perl.
=head1 SEE ALSO
B<grep>(1), B<sort>(1).
=cut
use strict;
use Getopt::Long;
# Main script: parse options, build the key definitions and run one
# search per set of search values.
# "bundling" lets single letter options be combined (-nr);
# "require_order" stops option parsing at the first non-option.
Getopt::Long::Configure("bundling","require_order");
# NOTE(review): the return value of GetOptions is not checked, so the
# script continues after an unknown option - confirm this is intended.
GetOptions(
"debug|D" => \$opt::D,
"version" => \$opt::version,
"verbose|v" => \$opt::verbose,
"B|byte-offset" => \$opt::byte_offset,
"b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
"d|dictionary-order" => \$opt::dictionary_order,
"f|ignore-case" => \$opt::ignore_case,
"g|general-numeric-sort" => \$opt::general_numeric_sort,
"G|grep" => \$opt::grep,
"F|file=s" => \$opt::file,
"i|ignore-nonprinting" => \$opt::ignore_nonprinting,
"M|month-sort" => \$opt::month_sort,
"h|human-numeric-sort" => \$opt::human_numeric_sort,
"n|numeric-sort" => \$opt::numeric_sort,
"N|numascii" => \$opt::numascii,
"r|reverse" => \$opt::reverse,
"R|random-sort" => \$opt::random_sort,
"sort=s" => \$opt::sort,
"V|version-sort" => \$opt::version_sort,
"k|key=s" => \@opt::key,
"t|field-separator=s" => \$opt::field_separator,
"z|zero-terminated" => \$opt::zero_terminated,
);
# Program name = last path component of $0
$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
$Global::version = 20200328;
if($opt::version) { version(); exit 0; }
# -z: records end with a 0 byte. $/ is set globally, so it applies to
# every readline and chomp in the script.
if($opt::zero_terminated) { $/ = "\0"; }
if(@opt::key) {
# Default separator if --key = whitespace
$Global::sep = '\s+';
if(defined $opt::field_separator) { $Global::sep = $opt::field_separator; }
}
# Being called as 2grep is the same as giving --grep
if($Global::progname eq "2grep") { $opt::grep = 1; }
$Global::debug = $opt::D;
# Fill @Global::keydefs from --key and the ordering options
parse_keydef();
debug(my_dump(\@Global::keydefs),"\n");
# First remaining argument is the file to search
my $file = shift;
# Select where get() takes the search strings from:
# the command line, the file given with --file, or stdin
if(@ARGV) {
$opt::argv = 1;
} elsif(defined $opt::file) {
# Search strings are read from --file by get()
} else {
$opt::stdin = 1;
}
# Each round fetches one search value per keydef and does one search.
# The loop ends when get() has no more values.
round:
while(1) {
my @search_vals;
for(@Global::keydefs) {
my $val = get();
if(not defined $val) {
last round;
}
push @search_vals, $val;
}
if($opt::grep) {
bgrep($file,@search_vals);
} else {
print bsearch($file,@search_vals);
}
}
{
    # Filehandle for --file. It is shared between calls to get() so
    # each call continues where the previous call stopped.
    my $fh;

    sub get {
        # Get the next search string
        # Uses:
        #   $opt::argv, $opt::file, $opt::stdin = where to read from
        # Returns:
        #   $val = next search string without line terminator
        #   undef if there are no more search strings
        if($opt::argv) {
            # Search for strings on the command line
            return shift @ARGV;
        }
        if(defined $opt::file) {
            # Search for strings given with --file
            if(not $fh) {
                # Open the shared $fh: declaring a new 'my $fh' here
                # would leave the shared handle unopened
                if(not open($fh, "<", $opt::file)) {
                    error("Cannot open $opt::file");
                    exit(255);
                }
            }
            my $val = <$fh>;
            if(defined $val) { chomp $val; }
            return $val;
        }
        if($opt::stdin) {
            # Search for strings on stdin
            my $val = <>;
            if(defined $val) { chomp $val; }
            return $val;
        }
        die;
    }
}
sub bgrep {
    # Print all lines in a sorted file that match the search values
    # Input:
    #   $file = sorted file to search
    #   @search_vals = one search value per keydef
    # Uses:
    #   @Global::keydefs (partial_match is switched on while printing)
    # Returns: N/A
    my ($file,@search_vals) = @_;
    my $startpos;
    {
        # bsearch() must return a byte offset here. 'local' restores
        # the user's --byte-offset setting afterwards.
        local $opt::byte_offset = 1;
        $startpos = bsearch($file,@search_vals);
    }
    my $fh;
    if(not open ($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    seek($fh,$startpos,0) or die;
    # Allow for partial matches in grep (4 matches 40, A matches Aaa)
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 1;
    }
    # Test for defined: a last line '0' without terminator is false
    # but must still be compared
    while(defined(my $line = <$fh>)) {
        # Stop at the first line that no longer matches
        if(compare($line,@search_vals)) { last; }
        print $line;
    }
    close $fh;
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 0;
    }
}
sub bsearch {
# Binary search a sorted file for the search values
# Input:
#   $file = sorted file to search
#   @search_vals = one search value per keydef
# Uses:
#   $opt::byte_offset
# Returns:
#   the line where the search ends, or, if $opt::byte_offset is set,
#   the byte position of that line followed by "\n"
my $file = shift;
my @search_vals = @_;
# $min/$max are byte positions delimiting the part still to search
my $min = 0;
my $max = -s $file;
my $fh;
if(not open ($fh, "<", $file)) {
error("Cannot open '$file'");
exit 1;
}
my($line,$middle);
# $minnl/$maxnl are the start-of-line positions found for $min/$max
my $minnl = $min;
my $maxnl = $max;
while($max - $min > 1) {
$middle = int(($max + $min)/2);
seek($fh,$middle,0) or die("Cannot seek to $middle");
if($middle > 0) {
# $middle is probably inside a line:
# Read last half of a line to get to the start of the next line
<$fh>;
}
my $newline_pos = tell($fh);
debug("$min <= $middle <= $newline_pos <= $max\n");
debug("$minnl <= $newline_pos <= $maxnl\n");
if($newline_pos == $maxnl
or
eof($fh)
or
compare(($line = <$fh>),@search_vals) >= 0) {
# We have seen this newline position before
# or we are at the end of the file
# or the line compares >= the search values:
# we should search the lower half
$max = $middle;
$maxnl = $newline_pos;
} else {
# The line compares < the search values:
# We should search the upper half
$min = $middle;
$minnl = $newline_pos;
}
}
# Decide between the line at $minnl and the line following it
seek($fh,$minnl,0) or die("Cannot seek to $minnl");
$line = <$fh>;
if(compare($line,@search_vals) >= 0) {
# The line at $minnl compares >= the search values
if($opt::byte_offset) {
return $minnl."\n";
} else {
return $line;
}
} else {
# The line at $minnl compares < the search values: use the next line
if($opt::byte_offset) {
return tell($fh)."\n";
} else {
$line=<$fh>;
return $line;
}
}
}
sub parse_keydef {
    # Build the list of key definitions from --key and the global
    # ordering options
    # keydef syntax: F[.C][OPTS][,F[.C][OPTS]]
    # Uses:
    #   @opt::key, $opt::* ordering options
    # Sets:
    #   @Global::keydefs (one hash ref is appended per key)
    #   $Global::sep = undef if no --key is given
    # Returns: N/A

    # Ordering letter => [ name used in the keydef, global default ]
    my %order = (
        "b" => ['ignore_leading_blanks', $opt::ignore_leading_blanks],
        "d" => ['dictionary_order',      $opt::dictionary_order],
        "f" => ['ignore_case',           $opt::ignore_case],
        "g" => ['general_numeric_sort',  $opt::general_numeric_sort],
        "i" => ['ignore_nonprinting',    $opt::ignore_nonprinting],
        "M" => ['month_sort',            $opt::month_sort],
        "h" => ['human_numeric_sort',    $opt::human_numeric_sort],
        "n" => ['numeric_sort',          $opt::numeric_sort],
        "N" => ['numascii',              $opt::numascii],
        "r" => ['reverse',               $opt::reverse],
        "R" => ['random_sort',           $opt::random_sort],
        "V" => ['version_sort',          $opt::version_sort],
    );
    # Make a keydef for field,char pre-filled with the global defaults
    my $new_keydef = sub {
        my ($field,$char) = @_;
        my %parsed = ('field' => $field, 'char' => $char);
        for my $entry (values %order) {
            my ($name,$default) = @$entry;
            $parsed{$name} = $default;
        }
        return \%parsed;
    };

    if(not @opt::key) {
        # Convert -n -r to -k1rn
        # with sep = undef
        $Global::sep = undef;
        push(@Global::keydefs, $new_keydef->(1,1));
        return;
    }
    for my $keyspec (@opt::key) {
        for my $keydef (split /,/, $keyspec) {
            # parse keydef F[.C][OPTS]
            my ($field,undef,$char,$flags) =
                ($keydef =~ /^(\d+)(\.(\d+))?([bdfgiMhnNRrV]+)?$/);
            if(not defined $field) {
                error("Keydef $keydef does not match F[.C][OPTS]");
                exit(255);
            }
            my $parsed = $new_keydef->($field, $char || 1);
            # Options given in the keydef are switched on for this key
            for my $o (split //, $flags) {
                $parsed->{$order{$o}[0]} = 1;
            }
            push(@Global::keydefs,$parsed);
        }
    }
}
sub compare {
    # Compare a line from the file with the search values, one search
    # value per keydef
    # Input:
    #   $line = line from the file
    #   @search_vals = one search value per keydef
    # Uses:
    #   $Global::sep, @Global::keydefs
    # Returns:
    #   result of compare_single() for the first keydef that differs
    #   0 if no keydef differs
    my $line = shift;
    my @search_vals = @_;
    chomp($line);
    debug("Compare: $line <=> @search_vals ");
    # With no separator the whole line is the only field
    my @field = $Global::sep ? split(/$Global::sep/o, $line) : ($line);
    my @pending = @search_vals;
    for my $keydef (@Global::keydefs) {
        # keydef = F[.C][OPTS]: field F starting from char C
        my $cmp = compare_single(substr($field[$keydef->{'field'}-1],
                                        $keydef->{'char'}-1),
                                 shift @pending,
                                 $keydef);
        debug("== $cmp\n");
        # They differ on this key
        $cmp and return $cmp;
    }
    # No difference on any keydefs
    return 0;
}
sub compare_single {
    # Compare a key from the file with a search value based on the
    # order options
    # Input:
    #   key from the line in the file
    #   search value
    #   $opt = keydef hash ref with the order options
    # Returns:
    #   negative, 0 or positive (as <=> and cmp)
    my ($key,$search,$opt) = @_;
    if($Global::debug) {
        # my_dump() is expensive: only call it when debugging
        debug("$key <=> $search");
        debug(my_dump($opt),"\n");
    }
    if($opt->{'random_sort'}) {
        return rand() <=> rand();
    }
    if($opt->{'ignore_leading_blanks'}) {
        $key =~ s/^\s+//;
        $search =~ s/^\s+//;
    }
    if($opt->{'ignore_case'}) {
        $key = uc($key);
        $search = uc($search);
    }
    if($opt->{'partial_match'}) {
        # String 'foo' matches 'foobar'
        $key = substr($key,0,length $search);
    }
    if($opt->{'reverse'}) {
        ($key,$search) = ($search,$key);
    }
    if($opt->{'human_numeric_sort'}) {
        return multiply_binary_prefix($key) <=> multiply_binary_prefix($search);
    }
    if($opt->{'month_sort'}) {
        my %m;
        my @mon = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
        # A hash slice needs a list: {1..12} would be a single hash ref
        @m{@mon} = (1..12);
        # Unknown months compare as 0: (unknown) < JAN
        return ($m{$key} || 0) <=> ($m{$search} || 0);
    }
    if($opt->{'numeric_sort'}) {
        return $key <=> $search;
    } elsif($opt->{'numascii'}) {
        # Use '||': 'return A or B' returns A and never evaluates B
        return(($key <=> $search) || ($key cmp $search));
    } else {
        return $key cmp $search;
    }
}
sub multiply_binary_prefix(@) {
    # Evaluate numbers with binary prefix
    # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^60, Zi=2^70, Yi=2^80, Xi=2^90
    # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^60, zi=2^70, yi=2^80, xi=2^90
    # K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^60, Z =2^70, Y =2^80, X =2^90
    # k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24, x=10^27
    # 13G = 13*1024*1024*1024 = 13958643712
    # Input:
    #   @v = strings with prefixes
    # Returns:
    #   list context: values with prefixes multiplied
    #   scalar context: the first value with prefixes multiplied
    #   A value is undef if it is not a plain arithmetic expression
    #   after the prefixes have been expanded
    my @v = @_;
    for(@v) {
        defined $_ or next;
        # 1E3=1000, 1E-3=0.001
        s/e([+-]?\d+)/*10**$1/gi;
        s/ki/*1024/gi;
        s/mi/*1024*1024/gi;
        s/gi/*1024*1024*1024/gi;
        s/ti/*1024*1024*1024*1024/gi;
        s/pi/*1024*1024*1024*1024*1024/gi;
        s/ei/*1024*1024*1024*1024*1024*1024/gi;
        s/zi/*1024*1024*1024*1024*1024*1024*1024/gi;
        s/yi/*1024*1024*1024*1024*1024*1024*1024*1024/gi;
        s/xi/*1024*1024*1024*1024*1024*1024*1024*1024*1024/gi;
        s/K/*1024/g;
        s/M/*1024*1024/g;
        s/G/*1024*1024*1024/g;
        s/T/*1024*1024*1024*1024/g;
        s/P/*1024*1024*1024*1024*1024/g;
        s/E/*1024*1024*1024*1024*1024*1024/g;
        s/Z/*1024*1024*1024*1024*1024*1024*1024/g;
        s/Y/*1024*1024*1024*1024*1024*1024*1024*1024/g;
        s/X/*1024*1024*1024*1024*1024*1024*1024*1024*1024/g;
        s/k/*1000/g;
        s/m/*1000*1000/g;
        s/g/*1000*1000*1000/g;
        s/t/*1000*1000*1000*1000/g;
        s/p/*1000*1000*1000*1000*1000/g;
        s/e/*1000*1000*1000*1000*1000*1000/g;
        s/z/*1000*1000*1000*1000*1000*1000*1000/g;
        s/y/*1000*1000*1000*1000*1000*1000*1000*1000/g;
        s/x/*1000*1000*1000*1000*1000*1000*1000*1000*1000/g;
        # SECURITY: the string comes from the searched file or from the
        # user and is passed to a string eval. Only evaluate strings
        # containing nothing but digits, '.', '*', '+', '-' and
        # whitespace, so no code from the input can be executed.
        if(/\A[0-9.*+\-\s]*\z/) {
            $_ = eval $_;
        } else {
            $_ = undef;
        }
    }
    return wantarray ? @v : $v[0];
}
sub status {
    # Print each message followed by a newline on the status filehandle
    # Uses:
    #   $Global::status_fd (STDERR is used if it is not set)
    # Returns: N/A
    my @msgs = @_;
    my $out = $Global::status_fd || *STDERR;
    my @lines;
    for my $msg (@msgs) {
        push @lines, $msg, "\n";
    }
    print $out @lines;
    $out->flush();
}
sub status_no_nl {
    # Print the messages as they are (no newline added) on the status
    # filehandle
    # Uses:
    #   $Global::status_fd (STDERR is used if it is not set)
    # Returns: N/A
    my @msgs = @_;
    my $out = $Global::status_fd || *STDERR;
    print {$out} @msgs;
    $out->flush();
}
sub warning {
    # Print each message as 'progname: Warning: message' via
    # status_no_nl()
    # Returns: N/A
    my $prog = $Global::progname || "parallel";
    my @out;
    for my $msg (@_) {
        push @out, $prog, ": Warning: ", $msg, "\n";
    }
    status_no_nl(@out);
}
sub error {
    # Print each message as 'progname: Error: message' via status()
    # Returns: N/A
    my $prog = $Global::progname || "parallel";
    my @out;
    for my $msg (@_) {
        push @out, $prog.": Error: ". $msg;
    }
    status(@out);
}
sub die_bug {
    # Print instructions for reporting a bug on STDERR and exit with 255
    # Input:
    #   $bugid = id telling where in the program the bug was detected
    # Returns: N/A (exits)
    my ($bugid) = @_;
    my @report =
        ("$Global::progname: This should not happen. You have found a bug.\n",
         "Please submit a bug at https://gitlab.com/ole.tange/tangetools/-/issues\n",
         "and include:\n",
         "* The version number: $Global::version\n",
         "* The bugid: $bugid\n",
         "* The command line being run\n",
         "* The files being read (put the files on a webserver if they are big)\n",
         "\n",
         "If you get the error on smaller/fewer files, please include those instead.\n");
    print STDERR @report;
    exit(255);
}
sub version {
    # Print program name, version and license information
    # Uses:
    #   $Global::progname, $Global::version
    # Returns: N/A
    my @lines =
        ("$Global::progname $Global::version",
         "Copyright (C) 2016-2020",
         "Ole Tange and Free Software Foundation, Inc.",
         "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
         "This is free software: you are free to change and redistribute it.",
         "$Global::progname comes with no warranty.",
         "",
         "Web site: https://gitlab.com/ole.tange/tangetools/\n",
        );
    my $text = join("\n", @lines);
    print $text;
}
sub my_dump(@) {
# Dump data structures as text (used for debug output)
# Input:
#   @dump_this = values to dump
# Returns:
# ascii expression of object if Data::Dump(er) is installed
# error code otherwise
# The modules are loaded with string eval at run time, so a missing
# module is detected through $@ instead of stopping the program.
my @dump_this = (@_);
eval "use Data::Dump qw(dump);";
if ($@) {
# Data::Dump not installed
eval "use Data::Dumper;";
if ($@) {
my $err = "Neither Data::Dump nor Data::Dumper is installed\n".
"Not dumping output\n";
::status($err);
return $err;
} else {
return Dumper(@dump_this);
}
} else {
# Create a dummy Data::Dump:dump as Hans Schou sometimes has
# it undefined
# NOTE(review): the string below has a single ':' so it does not
# define Data::Dump::dump; the eval presumably fails silently.
# Verify before correcting it: a working dummy might replace the
# real Data::Dump::dump loaded above.
eval "sub Data::Dump:dump {}";
eval "use Data::Dump qw(dump);";
return (Data::Dump::dump(@dump_this));
}
}
sub debug(@) {
    # Print the arguments if debugging is switched on
    # Uses:
    #   $Global::debug
    # Returns: N/A
    if($Global::debug) {
        print @_;
    }
}

777
2search/2search Executable file
View file

@ -0,0 +1,777 @@
#!/usr/bin/perl
=head1 NAME
2search - binary search through sorted text files
2grep - binary search+grep through sorted text files
=head1 SYNOPSIS
B<2search> [-nrfB] file string [string...]
B<2search> --grep [-nrf] file string [string...]
B<2grep> [-nrf] file string [string...]
... | B<2search> [-nrfB] file
... | B<2search> --grep [-nrf] file
... | B<2grep> [-nrf] file
=head1 DESCRIPTION
B<2search> searches a sorted file for a string. It outputs the
following line or the byte position of this line, which is where the
string would have been if it had been in the sorted file.
B<2grep> outputs all lines starting with a given string. The file must
be sorted.
=over 9
=item B<--ignore-leading-blanks>
=item B<-b>
ignore leading blanks
=item B<--byte-offset>
=item B<-B>
print byte position where string would have been
=item B<--dictionary-order> (not implemented)
=item B<-d>
consider only blanks and alphanumeric characters
=item B<--debug> (not implemented)
=item B<-D>
annotate the part of the line used to sort, and warn about
questionable usage to stderr
=item B<--ignore-case>
=item B<-f>
fold lower case to upper case characters
=item B<--file> I<file>
=item B<-F> I<file>
search for all lines in I<file>
=item B<--general-numeric-sort> (not implemented)
=item B<-g>
compare according to general numerical value
=item B<--ignore-nonprinting> (not implemented)
=item B<-i>
consider only printable characters
=item B<--month-sort>
=item B<-M>
compare (unknown) < 'JAN' < ... < 'DEC'
=item B<--human-numeric-sort>
=item B<-h>
compare human readable numbers (e.g., 2K 1G)
=item B<--key=KEYDEF> (not implemented)
=item B<-k>
sort via a key; KEYDEF gives location and type
=item B<--numeric-sort>
=item B<-n>
compare according to string numerical value. If numerical values are
the same: split the string into blocks of numbers and non-numbers, and
compare numbers as numbers and strings as strings.
This will sort like this: chr3 chr11 3chr 11chr
=item B<--numascii>
=item B<-N>
compare according to string numerical value. If numerical values are
the same: compare as strings
=item B<--random-sort>
=item B<-R>
sort by random hash of keys
=item B<--reverse>
=item B<-r>
reverse the result of comparisons
=item B<--sort=WORD> (not implemented)
sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
B<-M>, numeric B<-n>, random B<-R>, version B<-V>
=item B<-t>
=item B<--field-separator=SEP>
use SEP instead of non-blank to blank transition
=item B<-z>
=item B<--zero-terminated>
end lines with 0 byte, not newline
=back
=head1 EXAMPLES
=head2 Single key
Input is sorted by Chromosome,Position:
SampleID Position Chromosome
foo 10000123 chr3
foo 10000125 chr3
foo 9999998 chr11
foo 10000124 chr11
foo 10000126 chr11
To find all chr3:
2grep -n -k3 inputfile chr3
-n will split 'chr3' into 'chr' which is compared asciibetically and
'3' which is compared numerically.
=head2 Not implemented
To find all lines with chr3,10000125:
2grep -k3n,2n inputfile chr3 10000125
=head1 REPORTING BUGS
B<2search> is part of tangetools. Report bugs to <tools@tange.dk>.
=head1 AUTHOR
Copyright (C) 2016-2020 Ole Tange http://ole.tange.dk
=head1 LICENSE
Copyright (C) 2013 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
at your option any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head2 Documentation license I
Permission is granted to copy, distribute and/or modify this documentation
under the terms of the GNU Free Documentation License, Version 1.3 or
any later version published by the Free Software Foundation; with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts. A copy of the license is included in the file fdl.txt.
=head2 Documentation license II
You are free:
=over 9
=item B<to Share>
to copy, distribute and transmit the work
=item B<to Remix>
to adapt the work
=back
Under the following conditions:
=over 9
=item B<Attribution>
You must attribute the work in the manner specified by the author or
licensor (but not in any way that suggests that they endorse you or
your use of the work).
=item B<Share Alike>
If you alter, transform, or build upon this work, you may distribute
the resulting work only under the same, similar or a compatible
license.
=back
With the understanding that:
=over 9
=item B<Waiver>
Any of the above conditions can be waived if you get permission from
the copyright holder.
=item B<Public Domain>
Where the work or any of its elements is in the public domain under
applicable law, that status is in no way affected by the license.
=item B<Other Rights>
In no way are any of the following rights affected by the license:
=over 9
=item *
Your fair dealing or fair use rights, or other applicable
copyright exceptions and limitations;
=item *
The author's moral rights;
=item *
Rights other persons may have either in the work itself or in
how the work is used, such as publicity or privacy rights.
=back
=item B<Notice>
For any reuse or distribution, you must make clear to others the
license terms of this work.
=back
A copy of the full license is included in the file as cc-by-sa.txt.
=head1 DEPENDENCIES
B<2search>/B<2grep> uses Perl.
=head1 SEE ALSO
B<grep>(1), B<sort>(1).
=cut
use strict;
use Getopt::Long;
# Main script: parse options, build the key definitions and run one
# search per set of search values.
# "bundling" lets single letter options be combined (-nr);
# "require_order" stops option parsing at the first non-option.
Getopt::Long::Configure("bundling","require_order");
# NOTE(review): the return value of GetOptions is not checked, so the
# script continues after an unknown option - confirm this is intended.
GetOptions(
"debug|D" => \$opt::D,
"version" => \$opt::version,
"verbose|v" => \$opt::verbose,
"B|byte-offset" => \$opt::byte_offset,
"b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
"d|dictionary-order" => \$opt::dictionary_order,
"f|ignore-case" => \$opt::ignore_case,
"g|general-numeric-sort" => \$opt::general_numeric_sort,
"G|grep" => \$opt::grep,
"F|file=s" => \$opt::file,
"i|ignore-nonprinting" => \$opt::ignore_nonprinting,
"M|month-sort" => \$opt::month_sort,
"h|human-numeric-sort" => \$opt::human_numeric_sort,
"n|numeric-sort" => \$opt::numeric_sort,
"N|numascii" => \$opt::numascii,
"r|reverse" => \$opt::reverse,
"R|random-sort" => \$opt::random_sort,
"sort=s" => \$opt::sort,
"V|version-sort" => \$opt::version_sort,
"k|key=s" => \@opt::key,
"t|field-separator=s" => \$opt::field_separator,
"z|zero-terminated" => \$opt::zero_terminated,
);
# Program name = last path component of $0
$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
$Global::version = 20200328;
if($opt::version) { version(); exit 0; }
# -z: records end with a 0 byte. $/ is set globally, so it applies to
# every readline and chomp in the script.
if($opt::zero_terminated) { $/ = "\0"; }
if(@opt::key) {
# Default separator if --key = whitespace
$Global::sep = '\s+';
if(defined $opt::field_separator) { $Global::sep = $opt::field_separator; }
}
# Being called as 2grep is the same as giving --grep
if($Global::progname eq "2grep") { $opt::grep = 1; }
$Global::debug = $opt::D;
# Fill @Global::keydefs from --key and the ordering options
parse_keydef();
debug(my_dump(\@Global::keydefs),"\n");
# First remaining argument is the file to search
my $file = shift;
# Select where get() takes the search strings from:
# the command line, the file given with --file, or stdin
if(@ARGV) {
$opt::argv = 1;
} elsif(defined $opt::file) {
# Search strings are read from --file by get()
} else {
$opt::stdin = 1;
}
# Each round fetches one search value per keydef and does one search.
# The loop ends when get() has no more values.
round:
while(1) {
my @search_vals;
for(@Global::keydefs) {
my $val = get();
if(not defined $val) {
last round;
}
push @search_vals, $val;
}
if($opt::grep) {
bgrep($file,@search_vals);
} else {
print bsearch($file,@search_vals);
}
}
{
    # Filehandle for --file. It is shared between calls to get() so
    # each call continues where the previous call stopped.
    my $fh;

    sub get {
        # Get the next search string
        # Uses:
        #   $opt::argv, $opt::file, $opt::stdin = where to read from
        # Returns:
        #   $val = next search string without line terminator
        #   undef if there are no more search strings
        if($opt::argv) {
            # Search for strings on the command line
            return shift @ARGV;
        }
        if(defined $opt::file) {
            # Search for strings given with --file
            if(not $fh) {
                # Open the shared $fh: declaring a new 'my $fh' here
                # would leave the shared handle unopened
                if(not open($fh, "<", $opt::file)) {
                    error("Cannot open $opt::file");
                    exit(255);
                }
            }
            my $val = <$fh>;
            if(defined $val) { chomp $val; }
            return $val;
        }
        if($opt::stdin) {
            # Search for strings on stdin
            my $val = <>;
            if(defined $val) { chomp $val; }
            return $val;
        }
        die;
    }
}
sub bgrep {
    # Print all lines in a sorted file that match the search values
    # Input:
    #   $file = sorted file to search
    #   @search_vals = one search value per keydef
    # Uses:
    #   @Global::keydefs (partial_match is switched on while printing)
    # Returns: N/A
    my ($file,@search_vals) = @_;
    my $startpos;
    {
        # bsearch() must return a byte offset here. 'local' restores
        # the user's --byte-offset setting afterwards.
        local $opt::byte_offset = 1;
        $startpos = bsearch($file,@search_vals);
    }
    my $fh;
    if(not open ($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    seek($fh,$startpos,0) or die;
    # Allow for partial matches in grep (4 matches 40, A matches Aaa)
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 1;
    }
    # Test for defined: a last line '0' without terminator is false
    # but must still be compared
    while(defined(my $line = <$fh>)) {
        # Stop at the first line that no longer matches
        if(compare($line,@search_vals)) { last; }
        print $line;
    }
    close $fh;
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 0;
    }
}
sub bsearch {
# Binary search a sorted file for the search values
# Input:
#   $file = sorted file to search
#   @search_vals = one search value per keydef
# Uses:
#   $opt::byte_offset
# Returns:
#   the line where the search ends, or, if $opt::byte_offset is set,
#   the byte position of that line followed by "\n"
my $file = shift;
my @search_vals = @_;
# $min/$max are byte positions delimiting the part still to search
my $min = 0;
my $max = -s $file;
my $fh;
if(not open ($fh, "<", $file)) {
error("Cannot open '$file'");
exit 1;
}
my($line,$middle);
# $minnl/$maxnl are the start-of-line positions found for $min/$max
my $minnl = $min;
my $maxnl = $max;
while($max - $min > 1) {
$middle = int(($max + $min)/2);
seek($fh,$middle,0) or die("Cannot seek to $middle");
if($middle > 0) {
# $middle is probably inside a line:
# Read last half of a line to get to the start of the next line
<$fh>;
}
my $newline_pos = tell($fh);
debug("$min <= $middle <= $newline_pos <= $max\n");
debug("$minnl <= $newline_pos <= $maxnl\n");
if($newline_pos == $maxnl
or
eof($fh)
or
compare(($line = <$fh>),@search_vals) >= 0) {
# We have seen this newline position before
# or we are at the end of the file
# or the line compares >= the search values:
# we should search the lower half
$max = $middle;
$maxnl = $newline_pos;
} else {
# The line compares < the search values:
# We should search the upper half
$min = $middle;
$minnl = $newline_pos;
}
}
# Decide between the line at $minnl and the line following it
seek($fh,$minnl,0) or die("Cannot seek to $minnl");
$line = <$fh>;
if(compare($line,@search_vals) >= 0) {
# The line at $minnl compares >= the search values
if($opt::byte_offset) {
return $minnl."\n";
} else {
return $line;
}
} else {
# The line at $minnl compares < the search values: use the next line
if($opt::byte_offset) {
return tell($fh)."\n";
} else {
$line=<$fh>;
return $line;
}
}
}
sub parse_keydef {
    # Build the list of key definitions from --key and the global
    # ordering options
    # keydef syntax: F[.C][OPTS][,F[.C][OPTS]]
    # Uses:
    #   @opt::key, $opt::* ordering options
    # Sets:
    #   @Global::keydefs (one hash ref is appended per key)
    #   $Global::sep = undef if no --key is given
    # Returns: N/A

    # Ordering letter => [ name used in the keydef, global default ]
    my %order = (
        "b" => ['ignore_leading_blanks', $opt::ignore_leading_blanks],
        "d" => ['dictionary_order',      $opt::dictionary_order],
        "f" => ['ignore_case',           $opt::ignore_case],
        "g" => ['general_numeric_sort',  $opt::general_numeric_sort],
        "i" => ['ignore_nonprinting',    $opt::ignore_nonprinting],
        "M" => ['month_sort',            $opt::month_sort],
        "h" => ['human_numeric_sort',    $opt::human_numeric_sort],
        "n" => ['numeric_sort',          $opt::numeric_sort],
        "N" => ['numascii',              $opt::numascii],
        "r" => ['reverse',               $opt::reverse],
        "R" => ['random_sort',           $opt::random_sort],
        "V" => ['version_sort',          $opt::version_sort],
    );
    # Make a keydef for field,char pre-filled with the global defaults
    my $new_keydef = sub {
        my ($field,$char) = @_;
        my %parsed = ('field' => $field, 'char' => $char);
        for my $entry (values %order) {
            my ($name,$default) = @$entry;
            $parsed{$name} = $default;
        }
        return \%parsed;
    };

    if(not @opt::key) {
        # Convert -n -r to -k1rn
        # with sep = undef
        $Global::sep = undef;
        push(@Global::keydefs, $new_keydef->(1,1));
        return;
    }
    for my $keyspec (@opt::key) {
        for my $keydef (split /,/, $keyspec) {
            # parse keydef F[.C][OPTS]
            my ($field,undef,$char,$flags) =
                ($keydef =~ /^(\d+)(\.(\d+))?([bdfgiMhnNRrV]+)?$/);
            if(not defined $field) {
                error("Keydef $keydef does not match F[.C][OPTS]");
                exit(255);
            }
            my $parsed = $new_keydef->($field, $char || 1);
            # Options given in the keydef are switched on for this key
            for my $o (split //, $flags) {
                $parsed->{$order{$o}[0]} = 1;
            }
            push(@Global::keydefs,$parsed);
        }
    }
}
sub compare {
    # Compare a line from the file with the search values, one search
    # value per keydef
    # Input:
    #   $line = line from the file
    #   @search_vals = one search value per keydef
    # Uses:
    #   $Global::sep, @Global::keydefs
    # Returns:
    #   result of compare_single() for the first keydef that differs
    #   0 if no keydef differs
    my $line = shift;
    my @search_vals = @_;
    chomp($line);
    debug("Compare: $line <=> @search_vals ");
    # With no separator the whole line is the only field
    my @field = $Global::sep ? split(/$Global::sep/o, $line) : ($line);
    my @pending = @search_vals;
    for my $keydef (@Global::keydefs) {
        # keydef = F[.C][OPTS]: field F starting from char C
        my $cmp = compare_single(substr($field[$keydef->{'field'}-1],
                                        $keydef->{'char'}-1),
                                 shift @pending,
                                 $keydef);
        debug("== $cmp\n");
        # They differ on this key
        $cmp and return $cmp;
    }
    # No difference on any keydefs
    return 0;
}
sub compare_single {
    # Compare a key from the file with a search value based on the
    # order options
    # Input:
    #   key from the line in the file
    #   search value
    #   $opt = keydef hash ref with the order options
    # Returns:
    #   negative, 0 or positive (as <=> and cmp)
    my ($key,$search,$opt) = @_;
    if($Global::debug) {
        # my_dump() is expensive: only call it when debugging
        debug("$key <=> $search");
        debug(my_dump($opt),"\n");
    }
    if($opt->{'random_sort'}) {
        return rand() <=> rand();
    }
    if($opt->{'ignore_leading_blanks'}) {
        $key =~ s/^\s+//;
        $search =~ s/^\s+//;
    }
    if($opt->{'ignore_case'}) {
        $key = uc($key);
        $search = uc($search);
    }
    if($opt->{'partial_match'}) {
        # String 'foo' matches 'foobar'
        $key = substr($key,0,length $search);
    }
    if($opt->{'reverse'}) {
        ($key,$search) = ($search,$key);
    }
    if($opt->{'human_numeric_sort'}) {
        return multiply_binary_prefix($key) <=> multiply_binary_prefix($search);
    }
    if($opt->{'month_sort'}) {
        my %m;
        my @mon = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
        # A hash slice needs a list: {1..12} would be a single hash ref
        @m{@mon} = (1..12);
        # Unknown months compare as 0: (unknown) < JAN
        return ($m{$key} || 0) <=> ($m{$search} || 0);
    }
    if($opt->{'numeric_sort'}) {
        return $key <=> $search;
    } elsif($opt->{'numascii'}) {
        # Use '||': 'return A or B' returns A and never evaluates B
        return(($key <=> $search) || ($key cmp $search));
    } else {
        return $key cmp $search;
    }
}
sub multiply_binary_prefix(@) {
    # Evaluate numbers with binary prefix
    # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^60, Zi=2^70, Yi=2^80, Xi=2^90
    # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^60, zi=2^70, yi=2^80, xi=2^90
    # K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^60, Z =2^70, Y =2^80, X =2^90
    # k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24, x=10^27
    # 13G = 13*1024*1024*1024 = 13958643712
    # Input:
    #   @v = strings with prefixes
    # Returns:
    #   list context: values with prefixes multiplied
    #   scalar context: the first value with prefixes multiplied
    #   A value is undef if it is not a plain arithmetic expression
    #   after the prefixes have been expanded
    my @v = @_;
    for(@v) {
        defined $_ or next;
        # 1E3=1000, 1E-3=0.001
        s/e([+-]?\d+)/*10**$1/gi;
        s/ki/*1024/gi;
        s/mi/*1024*1024/gi;
        s/gi/*1024*1024*1024/gi;
        s/ti/*1024*1024*1024*1024/gi;
        s/pi/*1024*1024*1024*1024*1024/gi;
        s/ei/*1024*1024*1024*1024*1024*1024/gi;
        s/zi/*1024*1024*1024*1024*1024*1024*1024/gi;
        s/yi/*1024*1024*1024*1024*1024*1024*1024*1024/gi;
        s/xi/*1024*1024*1024*1024*1024*1024*1024*1024*1024/gi;
        s/K/*1024/g;
        s/M/*1024*1024/g;
        s/G/*1024*1024*1024/g;
        s/T/*1024*1024*1024*1024/g;
        s/P/*1024*1024*1024*1024*1024/g;
        s/E/*1024*1024*1024*1024*1024*1024/g;
        s/Z/*1024*1024*1024*1024*1024*1024*1024/g;
        s/Y/*1024*1024*1024*1024*1024*1024*1024*1024/g;
        s/X/*1024*1024*1024*1024*1024*1024*1024*1024*1024/g;
        s/k/*1000/g;
        s/m/*1000*1000/g;
        s/g/*1000*1000*1000/g;
        s/t/*1000*1000*1000*1000/g;
        s/p/*1000*1000*1000*1000*1000/g;
        s/e/*1000*1000*1000*1000*1000*1000/g;
        s/z/*1000*1000*1000*1000*1000*1000*1000/g;
        s/y/*1000*1000*1000*1000*1000*1000*1000*1000/g;
        s/x/*1000*1000*1000*1000*1000*1000*1000*1000*1000/g;
        # SECURITY: the string comes from the searched file or from the
        # user and is passed to a string eval. Only evaluate strings
        # containing nothing but digits, '.', '*', '+', '-' and
        # whitespace, so no code from the input can be executed.
        if(/\A[0-9.*+\-\s]*\z/) {
            $_ = eval $_;
        } else {
            $_ = undef;
        }
    }
    return wantarray ? @v : $v[0];
}
sub status {
    # Print each message followed by a newline on the status filehandle
    # Uses:
    #   $Global::status_fd (STDERR is used if it is not set)
    # Returns: N/A
    my @msgs = @_;
    my $out = $Global::status_fd || *STDERR;
    my @lines;
    for my $msg (@msgs) {
        push @lines, $msg, "\n";
    }
    print $out @lines;
    $out->flush();
}
sub status_no_nl {
    # Print the messages as they are (no newline added) on the status
    # filehandle
    # Uses:
    #   $Global::status_fd (STDERR is used if it is not set)
    # Returns: N/A
    my @msgs = @_;
    my $out = $Global::status_fd || *STDERR;
    print {$out} @msgs;
    $out->flush();
}
sub warning {
    # Print each message as 'progname: Warning: message' via
    # status_no_nl()
    # Returns: N/A
    my $prog = $Global::progname || "parallel";
    my @out;
    for my $msg (@_) {
        push @out, $prog, ": Warning: ", $msg, "\n";
    }
    status_no_nl(@out);
}
sub error {
    # Print each message as 'progname: Error: message' via status()
    # Returns: N/A
    my $prog = $Global::progname || "parallel";
    my @out;
    for my $msg (@_) {
        push @out, $prog.": Error: ". $msg;
    }
    status(@out);
}
sub die_bug {
    # Print instructions for reporting a bug on STDERR and exit with 255
    # Input:
    #   $bugid = id telling where in the program the bug was detected
    # Returns: N/A (exits)
    my ($bugid) = @_;
    my @report =
        ("$Global::progname: This should not happen. You have found a bug.\n",
         "Please submit a bug at https://gitlab.com/ole.tange/tangetools/-/issues\n",
         "and include:\n",
         "* The version number: $Global::version\n",
         "* The bugid: $bugid\n",
         "* The command line being run\n",
         "* The files being read (put the files on a webserver if they are big)\n",
         "\n",
         "If you get the error on smaller/fewer files, please include those instead.\n");
    print STDERR @report;
    exit(255);
}
sub version {
    # Print program name, version and license information
    # Uses:
    #   $Global::progname, $Global::version
    # Returns: N/A
    my @lines =
        ("$Global::progname $Global::version",
         "Copyright (C) 2016-2020",
         "Ole Tange and Free Software Foundation, Inc.",
         "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
         "This is free software: you are free to change and redistribute it.",
         "$Global::progname comes with no warranty.",
         "",
         "Web site: https://gitlab.com/ole.tange/tangetools/\n",
        );
    my $text = join("\n", @lines);
    print $text;
}
sub my_dump(@) {
# Dump data structures as text (used for debug output)
# Input:
#   @dump_this = values to dump
# Returns:
# ascii expression of object if Data::Dump(er) is installed
# error code otherwise
# The modules are loaded with string eval at run time, so a missing
# module is detected through $@ instead of stopping the program.
my @dump_this = (@_);
eval "use Data::Dump qw(dump);";
if ($@) {
# Data::Dump not installed
eval "use Data::Dumper;";
if ($@) {
my $err = "Neither Data::Dump nor Data::Dumper is installed\n".
"Not dumping output\n";
::status($err);
return $err;
} else {
return Dumper(@dump_this);
}
} else {
# Create a dummy Data::Dump:dump as Hans Schou sometimes has
# it undefined
# NOTE(review): the string below has a single ':' so it does not
# define Data::Dump::dump; the eval presumably fails silently.
# Verify before correcting it: a working dummy might replace the
# real Data::Dump::dump loaded above.
eval "sub Data::Dump:dump {}";
eval "use Data::Dump qw(dump);";
return (Data::Dump::dump(@dump_this));
}
}
sub debug(@) {
    # Print the arguments if debugging is switched on
    # Uses:
    #   $Global::debug
    # Returns: N/A
    if($Global::debug) {
        print @_;
    }
}

194
2search/regressiontest Executable file
View file

@ -0,0 +1,194 @@
#!/bin/bash
# Base name for scratch files. It is exported so that the test functions,
# which run in child shells, can derive their own file names from it.
# mktemp(1) is used because tempfile(1) is deprecated in its favour.
test_tmp=$(mktemp)
export test_tmp
# The base file only reserves a unique name; remove it when the script ends.
trap 'rm -f "$test_tmp"' EXIT
opt_tester() {
    # Run the standard 2search checks on files produced by sort(1).
    #
    # Arguments: the sort options under test (e.g. -n or -rn). The same
    # options are given to both sort and 2search, so every data set is
    # ordered exactly the way 2search is told to expect.
    #
    # For each data set the file content is echoed ("Search in ...") and
    # 2search is run twice: printing lines and printing byte offsets (-B).
    local opts=("$@")
    local tmp
    # mktemp(1) instead of tempfile(1), which is deprecated.
    tmp=$(mktemp)
    test_2search() {
        xargs echo Search in < "$tmp"
        2search "${opts[@]}" "$tmp" 0 2 2.1 100000
        2search "${opts[@]}" -B "$tmp" 0 2 2.1 100000
    }

    # Empty file
    true | sort "${opts[@]}" > "$tmp"
    echo Search in null file
    test_2search

    # File containing a single empty line
    echo | sort "${opts[@]}" > "$tmp"
    echo Search in newline
    test_2search

    # 1-3 lines where one value is written with trailing decimals, so the
    # lines differ in byte length while keeping the same numeric value.
    local values
    for values in \
        "1.000000000" \
        "1.000000000 2" \
        "1 2.000000000" \
        "1.000000000 2 3" \
        "1 2.000000000 3" \
        "1 2 3.000000000"
    do
        # $values is intentionally unquoted: one value per output line
        printf '%s\n' $values | sort "${opts[@]}" > "$tmp"
        test_2search
    done
    rm "$tmp"
}
# opt_tester does not match the "test_" pattern used to export the test
# functions, so it is exported explicitly.
export -f opt_tester
test_n() {
    # Check 2search -n (print line) and -nB (print byte offset) on
    # hand-written files that are already in numeric order.
    # Uses: $test_tmp (exported base name for the scratch file)
    local tmp=${test_tmp}_n
    search_n() {
        2search -n "$tmp" 0 2 2.1 100000
        2search -nB "$tmp" 0 2 2.1 100000
    }

    # Empty file
    true > "$tmp"
    echo Search in null file
    search_n

    # File containing a single empty line.
    # Plain echo rather than xargs: stdin is not redirected here, so xargs
    # would read whatever stdin the caller happens to provide (and block
    # on a terminal).
    echo > "$tmp"
    echo Search in newline
    search_n

    # 1-3 lines where one value is written with trailing decimals, so the
    # lines differ in byte length while keeping the same numeric value.
    local values
    for values in \
        "1.000000000" \
        "1.000000000 2" \
        "1 2.000000000" \
        "1.000000000 2 3" \
        "1 2.000000000 3" \
        "1 2 3.000000000"
    do
        # $values is intentionally unquoted: one value per output line
        printf '%s\n' $values > "$tmp"
        xargs echo Search in < "$tmp"
        search_n
    done
    rm "$tmp"
}
# Thin wrappers: each runs the opt_tester suite with one set of sort options.
test_n_opt() {
    # numeric order
    opt_tester -n
}
test_rn_opt() {
    # reverse numeric order
    opt_tester -rn
}
test_r_opt() {
    # reverse (non-numeric) order.
    # This used to pass -rn, which made it a duplicate of test_rn_opt.
    # The expected output in regressiontest.out has to be regenerated for
    # this test after the change.
    opt_tester -r
}
test_k32_2n_1n() {
    # Check 2grep with a multi-column key definition (-k3N,2N,1n): look up
    # the lines for 'Sample 10' / chr10 starting at position 111.
    # NOTE(review): 'Sample 10' is searched as a single column, so the
    # fixture columns are presumably meant to be tab separated - confirm
    # the separators below against the file in the repository.
    local tmp
    # mktemp(1) instead of tempfile(1), which is deprecated.
    tmp=$(mktemp)
    cat >"$tmp" <<EOF
1 chr1 Sample 1
11 chr1 Sample 1
111 chr1 Sample 1
1111 chr1 Sample 1
11111 chr1 Sample 1
111111 chr1 Sample 1
1 chr2 Sample 1
22 chr2 Sample 1
111 chr2 Sample 1
2222 chr2 Sample 1
11111 chr2 Sample 1
111111 chr2 Sample 1
1 chr10 Sample 1
11 chr10 Sample 1
111 chr10 Sample 1
1111 chr10 Sample 1
11111 chr10 Sample 1
111111 chr10 Sample 1
1 chr1 Sample 2
11 chr1 Sample 2
111 chr1 Sample 2
1111 chr1 Sample 2
11111 chr1 Sample 2
111111 chr1 Sample 2
1 chr2 Sample 2
22 chr2 Sample 2
111 chr2 Sample 2
2222 chr2 Sample 2
11111 chr2 Sample 2
111111 chr2 Sample 2
1 chr10 Sample 2
11 chr10 Sample 2
111 chr10 Sample 2
1111 chr10 Sample 2
11111 chr10 Sample 2
111111 chr10 Sample 2
1 chr1 Sample 10
11 chr1 Sample 10
111 chr1 Sample 10
1111 chr1 Sample 10
11111 chr1 Sample 10
111111 chr1 Sample 10
1 chr2 Sample 10
22 chr2 Sample 10
111 chr2 Sample 10
2222 chr2 Sample 10
11111 chr2 Sample 10
111111 chr2 Sample 10
1 chr10 Sample 10
11 chr10 Sample 10
111 chr10 Sample 10
1111 chr10 Sample 10
11111 chr10 Sample 10
111111 chr10 Sample 10
EOF
    2grep -k3N,2N,1n "$tmp" 'Sample 10' chr10 111
    # Clean up instead of printing the file name: the random name can
    # never match the expected output, and the file was left behind.
    rm "$tmp"
}
test_partial_line() {
    # Check that a search string which is only the beginning of a line
    # ("3") finds all lines starting with it, through all three entry
    # points: 2search --grep, 2grep with an argument, 2grep reading the
    # search string from stdin.
    local tmp
    # mktemp(1) instead of tempfile(1), which is deprecated.
    tmp=$(mktemp)
    # 2grep needs sorted input; LC_ALL=C gives plain byte order
    seq 100 | LC_ALL=C sort > "$tmp"
    echo '### 2search --grep'
    2search --grep "$tmp" 3
    echo '### 2grep'
    2grep "$tmp" 3
    echo '### ... | 2grep'
    echo 3 | 2grep "$tmp"
    rm "$tmp"
}
# Export every shell function whose name contains "test_" so the child
# shells started by GNU Parallel can call them.
export -f $(compgen -A function | grep test_)
# Run each test function as a separate job, 6 at a time, in name order.
# --tag prefixes every output line with the function name, -k keeps the
# output in input order, and each job's stderr is merged into its stdout.
compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1' > regressiontest.new
# Any difference from the recorded expected output is a regression.
diff regressiontest.new regressiontest.out

280
2search/regressiontest.out Normal file
View file

@ -0,0 +1,280 @@
test_k32_2n_1n 111 chr10 Sample 10
test_k32_2n_1n 1111 chr10 Sample 10
test_k32_2n_1n 11111 chr10 Sample 10
test_k32_2n_1n 111111 chr10 Sample 10
test_n Search in null file
test_n 0
test_n 0
test_n 0
test_n 0
test_n Search in newline
test_n
test_n 0
test_n 1
test_n 1
test_n 1
test_n Search in 1.000000000
test_n 1.000000000
test_n 0
test_n 12
test_n 12
test_n 12
test_n Search in 1.000000000 2
test_n 1.000000000
test_n 2
test_n 0
test_n 12
test_n 14
test_n 14
test_n Search in 1 2.000000000
test_n 1
test_n 2.000000000
test_n 0
test_n 2
test_n 14
test_n 14
test_n Search in 1.000000000 2 3
test_n 1.000000000
test_n 2
test_n 3
test_n 0
test_n 12
test_n 14
test_n 16
test_n Search in 1 2.000000000 3
test_n 1
test_n 2.000000000
test_n 3
test_n 0
test_n 2
test_n 14
test_n 16
test_n Search in 1 2 3.000000000
test_n 1
test_n 2
test_n 3.000000000
test_n 0
test_n 2
test_n 4
test_n 16
test_n_opt Search in null file
test_n_opt Search in
test_n_opt 0
test_n_opt 0
test_n_opt 0
test_n_opt 0
test_n_opt Search in newline
test_n_opt Search in
test_n_opt
test_n_opt 0
test_n_opt 1
test_n_opt 1
test_n_opt 1
test_n_opt Search in 1.000000000
test_n_opt 1.000000000
test_n_opt 0
test_n_opt 12
test_n_opt 12
test_n_opt 12
test_n_opt Search in 1.000000000 2
test_n_opt 1.000000000
test_n_opt 2
test_n_opt 0
test_n_opt 12
test_n_opt 14
test_n_opt 14
test_n_opt Search in 1 2.000000000
test_n_opt 1
test_n_opt 2.000000000
test_n_opt 0
test_n_opt 2
test_n_opt 14
test_n_opt 14
test_n_opt Search in 1.000000000 2 3
test_n_opt 1.000000000
test_n_opt 2
test_n_opt 3
test_n_opt 0
test_n_opt 12
test_n_opt 14
test_n_opt 16
test_n_opt Search in 1 2.000000000 3
test_n_opt 1
test_n_opt 2.000000000
test_n_opt 3
test_n_opt 0
test_n_opt 2
test_n_opt 14
test_n_opt 16
test_n_opt Search in 1 2 3.000000000
test_n_opt 1
test_n_opt 2
test_n_opt 3.000000000
test_n_opt 0
test_n_opt 2
test_n_opt 4
test_n_opt 16
test_partial_line ### 2search --grep
test_partial_line 3
test_partial_line 30
test_partial_line 31
test_partial_line 32
test_partial_line 33
test_partial_line 34
test_partial_line 35
test_partial_line 36
test_partial_line 37
test_partial_line 38
test_partial_line 39
test_partial_line ### 2grep
test_partial_line 3
test_partial_line 30
test_partial_line 31
test_partial_line 32
test_partial_line 33
test_partial_line 34
test_partial_line 35
test_partial_line 36
test_partial_line 37
test_partial_line 38
test_partial_line 39
test_partial_line ### ... | 2grep
test_partial_line 3
test_partial_line 30
test_partial_line 31
test_partial_line 32
test_partial_line 33
test_partial_line 34
test_partial_line 35
test_partial_line 36
test_partial_line 37
test_partial_line 38
test_partial_line 39
test_rn_opt Search in null file
test_rn_opt Search in
test_rn_opt 0
test_rn_opt 0
test_rn_opt 0
test_rn_opt 0
test_rn_opt Search in newline
test_rn_opt Search in
test_rn_opt
test_rn_opt
test_rn_opt
test_rn_opt
test_rn_opt 0
test_rn_opt 0
test_rn_opt 0
test_rn_opt 0
test_rn_opt Search in 1.000000000
test_rn_opt 1.000000000
test_rn_opt 1.000000000
test_rn_opt 1.000000000
test_rn_opt 12
test_rn_opt 0
test_rn_opt 0
test_rn_opt 0
test_rn_opt Search in 2 1.000000000
test_rn_opt 2
test_rn_opt 2
test_rn_opt 2
test_rn_opt 14
test_rn_opt 0
test_rn_opt 0
test_rn_opt 0
test_rn_opt Search in 2.000000000 1
test_rn_opt 2.000000000
test_rn_opt 2.000000000
test_rn_opt 2.000000000
test_rn_opt 14
test_rn_opt 0
test_rn_opt 0
test_rn_opt 0
test_rn_opt Search in 3 2 1.000000000
test_rn_opt 2
test_rn_opt 2
test_rn_opt 3
test_rn_opt 16
test_rn_opt 2
test_rn_opt 2
test_rn_opt 0
test_rn_opt Search in 3 2.000000000 1
test_rn_opt 2.000000000
test_rn_opt 2.000000000
test_rn_opt 3
test_rn_opt 16
test_rn_opt 2
test_rn_opt 2
test_rn_opt 0
test_rn_opt Search in 3.000000000 2 1
test_rn_opt 2
test_rn_opt 2
test_rn_opt 3.000000000
test_rn_opt 16
test_rn_opt 12
test_rn_opt 12
test_rn_opt 0
test_r_opt Search in null file
test_r_opt Search in
test_r_opt 0
test_r_opt 0
test_r_opt 0
test_r_opt 0
test_r_opt Search in newline
test_r_opt Search in
test_r_opt
test_r_opt
test_r_opt
test_r_opt
test_r_opt 0
test_r_opt 0
test_r_opt 0
test_r_opt 0
test_r_opt Search in 1.000000000
test_r_opt 1.000000000
test_r_opt 1.000000000
test_r_opt 1.000000000
test_r_opt 12
test_r_opt 0
test_r_opt 0
test_r_opt 0
test_r_opt Search in 2 1.000000000
test_r_opt 2
test_r_opt 2
test_r_opt 2
test_r_opt 14
test_r_opt 0
test_r_opt 0
test_r_opt 0
test_r_opt Search in 2.000000000 1
test_r_opt 2.000000000
test_r_opt 2.000000000
test_r_opt 2.000000000
test_r_opt 14
test_r_opt 0
test_r_opt 0
test_r_opt 0
test_r_opt Search in 3 2 1.000000000
test_r_opt 2
test_r_opt 2
test_r_opt 3
test_r_opt 16
test_r_opt 2
test_r_opt 2
test_r_opt 0
test_r_opt Search in 3 2.000000000 1
test_r_opt 2.000000000
test_r_opt 2.000000000
test_r_opt 3
test_r_opt 16
test_r_opt 2
test_r_opt 2
test_r_opt 0
test_r_opt Search in 3.000000000 2 1
test_r_opt 2
test_r_opt 2
test_r_opt 3.000000000
test_r_opt 16
test_r_opt 12
test_r_opt 12
test_r_opt 0

View file

@ -1,21 +1,22 @@
CMD = blink bsearch burncpu duplicate-packets em encdir field forever \ CMD = blink 2grep 2search burncpu duplicate-packets em encdir field forever \
fxkill G gitnext gitundo goodpasswd histogram mtrr mirrorpdf \ fxkill G gitnext gitundo goodpasswd histogram mtrr mirrorpdf \
neno off pdfman pidcmd plotpipe puniq ramusage rand rclean \ neno off pdfman pidcmd plotpipe puniq ramusage rand rclean \
rina rn rrm seekmaniac shython sound-reload splitvideo stdout \ rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
swapout T timestamp tracefile transpose upsidedown vid \ swapout T timestamp tracefile transpose upsidedown vid \
w4it-for-port-open whitehash wifi-reload wssh ytv yyyymmdd w4it-for-port-open whitehash wifi-reload wssh ytv yyyymmdd
all: blink/blink.1 bsearch/bsearch.1 burncpu/burncpu.1 \ all: blink/blink.1 2search/2grep.1 2search/2search.1 \
encdir/encdir.1 G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 \ burncpu/burncpu.1 encdir/encdir.1 G/G.1 gitnext/gitnext.1 \
goodpasswd/goodpasswd.1 histogram/histogram.1 \ gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 \ histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
pidcmd/pidcmd.1 plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 \ off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 plotpipe/plotpipe.1 \
rina/rina.1 rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 \ puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \
shython/shython.1 sound-reload/sound-reload.1 \ seekmaniac/seekmaniac.1 shython/shython.1 \
splitvideo/splitvideo.1 stdout/stdout.1 timestamp/timestamp.1 \ sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
tracefile/tracefile.1 transpose/transpose.1 T/T.1 \ stdout/stdout.1 timestamp/timestamp.1 tracefile/tracefile.1 \
upsidedown/upsidedown.1 vid/vid.1 wifi-reload/wifi-reload.1 \ transpose/transpose.1 T/T.1 upsidedown/upsidedown.1 vid/vid.1 \
wssh/wssh.1 ytv/ytv.1 yyyymmdd/yyyymmdd.1 wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 \
yyyymmdd/yyyymmdd.1
%.1: % %.1: %
pod2man $< > $@ pod2man $< > $@

View file

@ -1,404 +0,0 @@
#!/usr/bin/perl
=head1 NAME
bsearch - binary search through sorted text files
=head1 SYNOPSIS
B<bsearch> [-nrfB] file string [string...]
=head1 DESCRIPTION
B<bsearch> searches a sorted file for a string. It outputs the
following line or the byte position of this line, which is where the
string would have been if it had been in the sorted file.
=over 9
=item B<--ignore-leading-blanks> (not implemented)
=item B<-b>
ignore leading blanks
=item B<--byte-offset>
=item B<-B>
print byte position where string would have been
=item B<--dictionary-order> (not implemented)
=item B<-d>
consider only blanks and alphanumeric characters
=item B<--debug> (not implemented)
=item B<-D>
annotate the part of the line used to sort, and warn about
questionable usage to stderr
=item B<--ignore-case>
=item B<-f>
fold lower case to upper case characters
=item B<--general-numeric-sort> (not implemented)
=item B<-g>
compare according to general numerical value
=item B<--ignore-nonprinting> (not implemented)
=item B<-i>
consider only printable characters
=item B<--month-sort> (not implemented)
=item B<-M>
compare (unknown) < 'JAN' < ... < 'DEC'
=item B<--human-numeric-sort> (not implemented)
=item B<-h>
compare human readable numbers (e.g., 2K 1G)
=item B<--key=KEYDEF> (not implemented)
=item B<-k>
sort via a key; KEYDEF gives location and type
=item B<--numeric-sort>
=item B<-n>
compare according to string numerical value
=item B<--random-sort>
=item B<-R>
sort by random hash of keys
=item B<--reverse>
=item B<-r>
reverse the result of comparisons
=item B<--sort=WORD> (not implemented)
sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
B<-M>, numeric B<-n>, random B<-R>, version B<-V>
=item B<-t> (not implemented)
=item B<--field-separator=SEP>
use SEP instead of non-blank to blank transition
=item B<-z>
=item B<--zero-terminated>
end lines with 0 byte, not newline
=back
=head1 EXAMPLES
=head2 Missing
Missing
=head1 REPORTING BUGS
B<bsearch> is part of tangetools. Report bugs to <tools@tange.dk>.
=head1 AUTHOR
Copyright (C) 2016 Ole Tange http://ole.tange.dk
=head1 LICENSE
Copyright (C) 2013 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
at your option any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head2 Documentation license I
Permission is granted to copy, distribute and/or modify this documentation
under the terms of the GNU Free Documentation License, Version 1.3 or
any later version published by the Free Software Foundation; with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts. A copy of the license is included in the file fdl.txt.
=head2 Documentation license II
You are free:
=over 9
=item B<to Share>
to copy, distribute and transmit the work
=item B<to Remix>
to adapt the work
=back
Under the following conditions:
=over 9
=item B<Attribution>
You must attribute the work in the manner specified by the author or
licensor (but not in any way that suggests that they endorse you or
your use of the work).
=item B<Share Alike>
If you alter, transform, or build upon this work, you may distribute
the resulting work only under the same, similar or a compatible
license.
=back
With the understanding that:
=over 9
=item B<Waiver>
Any of the above conditions can be waived if you get permission from
the copyright holder.
=item B<Public Domain>
Where the work or any of its elements is in the public domain under
applicable law, that status is in no way affected by the license.
=item B<Other Rights>
In no way are any of the following rights affected by the license:
=over 9
=item *
Your fair dealing or fair use rights, or other applicable
copyright exceptions and limitations;
=item *
The author's moral rights;
=item *
Rights other persons may have either in the work itself or in
how the work is used, such as publicity or privacy rights.
=back
=item B<Notice>
For any reuse or distribution, you must make clear to others the
license terms of this work.
=back
A copy of the full license is included in the file as cc-by-sa.txt.
=head1 DEPENDENCIES
B<bsearch> uses Perl.
=head1 SEE ALSO
B<grep>(1), B<sort>(1).
=cut
use Getopt::Long;
# "bundling" lets single-letter options be combined (-nrf);
# "require_order" stops option parsing at the first non-option argument,
# so everything after the file name is treated as a search string.
Getopt::Long::Configure("bundling","require_order");
# Option names mirror sort(1). The values end up in $opt::*.
# In this file only byte_offset, ignore_case, numeric_sort, reverse,
# random_sort, zero_terminated and version are read afterwards; the
# remaining options are accepted but have no effect here.
# NOTE(review): the return value of GetOptions is not checked, so the
# program continues after an unknown option.
GetOptions(
"debug|D=s" => \$opt::D,
"version" => \$opt::version,
"verbose|v" => \$opt::verbose,
"B|byte-offset" => \$opt::byte_offset,
"b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
"d|dictionary-order" => \$opt::dictionary_order,
"f|ignore-case" => \$opt::ignore_case,
"g|general-numeric-sort" => \$opt::general_numeric_sort,
"i|ignore-nonprinting" => \$opt::ignore_nonprinting,
"M|month-sort" => \$opt::month_sort,
"h|human-numeric-sort" => \$opt::human_numeric_sort,
"n|numeric-sort" => \$opt::numeric_sort,
"r|reverse" => \$opt::reverse,
"R|random-sort" => \$opt::random_sort,
"sort=s" => \$opt::sort,
"V|version-sort" => \$opt::version_sort,
"k|key=s" => \@opt::key,
"t|field-separator=s" => \$opt::field_separator,
"z|zero-terminated" => \$opt::zero_terminated,
);
# Name and version used in messages and by --version
$Global::progname = "bsearch";
$Global::version = 20160712;
if($opt::version) {
version();
exit 0;
}
# -z: records are terminated by a NUL byte instead of newline
if($opt::zero_terminated) { $/ = "\0"; }
# First remaining argument is the sorted file; the rest are search strings.
# Each string is looked up independently and the result printed.
my $file = shift;
for my $key (@ARGV) {
print bsearch($file,$key);
}
sub bsearch {
    # Binary search a sorted file for a key by bisecting byte offsets.
    # Input:
    #   $file = name of the sorted file
    #   $key = string to search for
    # Uses:
    #   compare() to order a line relative to $key
    #   $opt::byte_offset to choose between the two kinds of output
    # Returns:
    #   with $opt::byte_offset: a byte position followed by "\n"
    #   otherwise: the line found ("" or undef if there is none to print)
    # Exits with status 1 if the file cannot be opened.
    my $file = shift;
    my $key = shift;
    my $min = 0;
    my $max = -s $file;
    # Lexical handle: it is closed automatically when the sub returns
    # and cannot clash with a global of the same name.
    my $fh;
    if(not open($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    my $line;
    while($max - $min > 1) {
        my $middle = int(($max + $min)/2);
        seek($fh,$middle,0) or die "$file: cannot seek to $middle: $!\n";
        # $middle most likely points into the middle of a line: read and
        # discard the rest of that line so the next read is a full line.
        my $half = <$fh>;
        if(eof($fh)
           or
           compare(($line = <$fh>),$key) >= 0) {
            # Nothing after $middle sorts before $key: look earlier
            $max = $middle;
        } else {
            $min = $middle;
        }
    }
    seek($fh,$max,0) or die "$file: cannot seek to $max: $!\n";
    $line = <$fh>;
    if(compare($line,$key) >= 0) {
        if($opt::byte_offset) {
            return "0\n";
        } else {
            # The very first line
            return "";
        }
    } else {
        if($opt::byte_offset) {
            # Position just after the line that was read
            return tell($fh)."\n";
        } else {
            return $line;
        }
    }
}
sub compare {
    # Compare two strings according to the sort options given.
    # Input:
    #   two strings (typically a line from the file and the search key)
    # Uses:
    #   $opt::random_sort = return a random ordering
    #   $opt::reverse = swap the operands before comparing
    #   $opt::ignore_case = compare upper-cased copies
    #   $opt::numeric_sort = compare numerically
    #   $opt::numascii = compare numerically, break ties as strings
    # Returns:
    #   -1, 0 or 1 like <=> and cmp
    # The locals are not called $a/$b to avoid shadowing sort's variables.
    my ($left, $right) = @_;
    if($opt::random_sort) {
        return rand() <=> rand();
    }
    if($opt::reverse) {
        ($left, $right) = ($right, $left);
    }
    if($opt::ignore_case) {
        $left = uc($left);
        $right = uc($right);
    }
    if($opt::numeric_sort) {
        return $left <=> $right;
    } elsif($opt::numascii) {
        # Parentheses and || are required: with "return X or Y" the
        # return happens before "or Y" is ever evaluated, so the string
        # tie-break would be dead code.
        return (($left <=> $right) || ($left cmp $right));
    } else {
        return $left cmp $right;
    }
}
sub status {
    # Print each argument followed by a newline on the status filehandle
    # and flush it.
    # Uses:
    #   $Global::status_fd = filehandle to write to (STDERR if unset)
    # Returns: the result of flushing the filehandle
    my $out = $Global::status_fd || *STDERR;
    my @chunks;
    for my $msg (@_) {
        push @chunks, $msg, "\n";
    }
    print {$out} @chunks;
    $out->flush();
}
sub status_no_nl {
    # Print the arguments verbatim (no newline is added) on the status
    # filehandle and flush it.
    # Uses:
    #   $Global::status_fd = filehandle to write to (STDERR if unset)
    # Returns: the result of flushing the filehandle
    my $out = $Global::status_fd || *STDERR;
    print {$out} @_;
    $out->flush();
}
sub warning {
    # Emit one "<progname>: Warning: <message>" line per argument through
    # status_no_nl().
    # Uses:
    #   $Global::progname = name used as prefix ("parallel" if unset)
    my $name = $Global::progname || "parallel";
    my @pieces;
    for my $msg (@_) {
        push @pieces, $name, ": Warning: ", $msg, "\n";
    }
    return status_no_nl(@pieces);
}
sub error {
    # Emit one "<progname>: Error: <message>" line per argument through
    # status(), which is handed one complete string per message.
    # Uses:
    #   $Global::progname = name used as prefix ("parallel" if unset)
    my $name = $Global::progname || "parallel";
    my @lines;
    for my $msg (@_) {
        push @lines, $name . ": Error: " . $msg;
    }
    return status(@lines);
}
sub die_bug {
    # Tell the user on STDERR that an internal invariant was violated and
    # how to report it, then terminate the program.
    # Input:
    #   $bugid = identifier of the place in the code that detected the bug
    # Returns: never (exits with status 255)
    my $bugid = shift;
    print STDERR
        ("$Global::progname: This should not happen. You have found a bug.\n",
         # Same address as the REPORTING BUGS section of the manual
         "Please contact <tools\@tange.dk> and include:\n",
         "* The version number: $Global::version\n",
         "* The bugid: $bugid\n",
         "* The command line being run\n",
         "* The files being read (put the files on a webserver if they are big)\n",
         "\n",
         "If you get the error on smaller/fewer files, please include those instead.\n");
    # Plain exit: ::wait_and_exit() is not defined in this script, so
    # calling it would die with "Undefined subroutine" instead.
    exit(255);
}
sub version {
    # Print program name, version, copyright, license and citation notice
    # on the currently selected output filehandle.
    # Uses:
    #   $Global::progname, $Global::version
    # Returns: N/A
    my @lines =
        ("GNU $Global::progname $Global::version",
         "Copyright (C) 2016",
         "Ole Tange and Free Software Foundation, Inc.",
         "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
         "This is free software: you are free to change and redistribute it.",
         "GNU $Global::progname comes with no warranty.",
         "",
         "Web site: http://www.gnu.org/software/${Global::progname}\n",
         "When using programs that use GNU Parallel to process data for publication",
         "please cite as described in 'parallel --citation'.\n");
    print join("\n", @lines);
}

View file

@ -1,44 +0,0 @@
#!/bin/bash
# Base name for scratch files. It is exported so that the test functions,
# which run in child shells, can derive their own file names from it.
# mktemp(1) is used because tempfile(1) is deprecated in its favour.
test_tmp=$(mktemp)
export test_tmp
# The base file only reserves a unique name; remove it when the script ends.
trap 'rm -f "$test_tmp"' EXIT
test_n() {
    # Check bsearch -n on hand-written files that are already in numeric
    # order. Before each search the file content is printed by xargs.
    # Uses: $test_tmp (exported base name for the scratch file)
    tmp=${test_tmp}_n

    # Empty file (content is not printed for this case)
    true > $tmp
    bsearch -n $tmp 0 2 2.1 100000

    # File containing a single empty line
    echo > $tmp
    xargs < $tmp
    bsearch -n $tmp 0 2 2.1 100000

    # 1-3 lines where one value is written with trailing decimals
    for values in \
        "1.000000000" \
        "1.000000000 2" \
        "1 2.000000000" \
        "1.000000000 2 3" \
        "1 2.000000000 3" \
        "1 2 3.000000000"
    do
        # $values is intentionally unquoted: one value per output line
        printf '%s\n' $values > $tmp
        xargs < $tmp
        bsearch -n $tmp 0 2 2.1 100000
    done
    rm $tmp
}
# Export every shell function whose name contains "test_" so the child
# shells started by GNU Parallel can call them.
export -f $(compgen -A function | grep test_)
# Run each test function as a separate job, 6 at a time, in name order.
# --tag prefixes every output line with the function name, -k keeps the
# output in input order, and each job's stderr is merged into its stdout.
compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1'