tangetools/bsearch/bsearch
2018-01-04 01:32:04 +01:00

405 lines
8.5 KiB
Perl
Executable file

#!/usr/bin/perl
=head1 NAME
bsearch - binary search through sorted text files
=head1 SYNOPSIS
B<bsearch> [-nrfB] file string [string...]
=head1 DESCRIPTION
B<bsearch> searches a sorted file for a string. It outputs the
following line or the byte position of this line, which is where the
string would have been if it had been in the sorted file.
=over 9
=item B<--ignore-leading-blanks> (not implemented)
=item B<-b>
ignore leading blanks
=item B<--byte-offset>
=item B<-B>
print byte position where string would have been
=item B<--dictionary-order> (not implemented)
=item B<-d>
consider only blanks and alphanumeric characters
=item B<--debug> (not implemented)
=item B<-D>
annotate the part of the line used to sort, and warn about
questionable usage to stderr
=item B<--ignore-case>
=item B<-f>
fold lower case to upper case characters
=item B<--general-numeric-sort> (not implemented)
=item B<-g>
compare according to general numerical value
=item B<--ignore-nonprinting> (not implemented)
=item B<-i>
consider only printable characters
=item B<--month-sort> (not implemented)
=item B<-M>
compare (unknown) < 'JAN' < ... < 'DEC'
=item B<--human-numeric-sort> (not implemented)
=item B<-h>
compare human readable numbers (e.g., 2K 1G)
=item B<--key=KEYDEF> (not implemented)
=item B<-k>
sort via a key; KEYDEF gives location and type
=item B<--numeric-sort>
=item B<-n>
compare according to string numerical value
=item B<--random-sort>
=item B<-R>
sort by random hash of keys
=item B<--reverse>
=item B<-r>
reverse the result of comparisons
=item B<--sort=WORD> (not implemented)
sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
B<-M>, numeric B<-n>, random B<-R>, version B<-V>
=item B<-t> (not implemented)
=item B<--field-separator=SEP>
use SEP instead of non-blank to blank transition
=item B<-z>
=item B<--zero-terminated>
end lines with 0 byte, not newline
=back
=head1 EXAMPLES
=head2 Missing
Missing
=head1 REPORTING BUGS
B<bsearch> is part of tangetools. Report bugs to <tools@tange.dk>.
=head1 AUTHOR
Copyright (C) 2016 Ole Tange http://ole.tange.dk
=head1 LICENSE
Copyright (C) 2013 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
at your option any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head2 Documentation license I
Permission is granted to copy, distribute and/or modify this documentation
under the terms of the GNU Free Documentation License, Version 1.3 or
any later version published by the Free Software Foundation; with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts. A copy of the license is included in the file fdl.txt.
=head2 Documentation license II
You are free:
=over 9
=item B<to Share>
to copy, distribute and transmit the work
=item B<to Remix>
to adapt the work
=back
Under the following conditions:
=over 9
=item B<Attribution>
You must attribute the work in the manner specified by the author or
licensor (but not in any way that suggests that they endorse you or
your use of the work).
=item B<Share Alike>
If you alter, transform, or build upon this work, you may distribute
the resulting work only under the same, similar or a compatible
license.
=back
With the understanding that:
=over 9
=item B<Waiver>
Any of the above conditions can be waived if you get permission from
the copyright holder.
=item B<Public Domain>
Where the work or any of its elements is in the public domain under
applicable law, that status is in no way affected by the license.
=item B<Other Rights>
In no way are any of the following rights affected by the license:
=over 9
=item *
Your fair dealing or fair use rights, or other applicable
copyright exceptions and limitations;
=item *
The author's moral rights;
=item *
Rights other persons may have either in the work itself or in
how the work is used, such as publicity or privacy rights.
=back
=item B<Notice>
For any reuse or distribution, you must make clear to others the
license terms of this work.
=back
A copy of the full license is included in the file as cc-by-sa.txt.
=head1 DEPENDENCIES
B<bsearch> uses Perl.
=head1 SEE ALSO
B<grep>(1), B<sort>(1).
=cut
use Getopt::Long;
Getopt::Long::Configure("bundling","require_order");
GetOptions(
"debug|D=s" => \$opt::D,
"version" => \$opt::version,
"verbose|v" => \$opt::verbose,
"B|byte-offset" => \$opt::byte_offset,
"b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
"d|dictionary-order" => \$opt::dictionary_order,
"f|ignore-case" => \$opt::ignore_case,
"g|general-numeric-sort" => \$opt::general_numeric_sort,
"i|ignore-nonprinting" => \$opt::ignore_nonprinting,
"M|month-sort" => \$opt::month_sort,
"h|human-numeric-sort" => \$opt::human_numeric_sort,
"n|numeric-sort" => \$opt::numeric_sort,
"r|reverse" => \$opt::reverse,
"R|random-sort" => \$opt::random_sort,
"sort=s" => \$opt::sort,
"V|version-sort" => \$opt::version_sort,
"k|key=s" => \@opt::key,
"t|field-separator=s" => \$opt::field_separator,
"z|zero-terminated" => \$opt::zero_terminated,
);
$Global::progname = "bsearch";
$Global::version = 20160712;
if($opt::version) {
version();
exit 0;
}
if($opt::zero_terminated) { $/ = "\0"; }
my $file = shift;
for my $key (@ARGV) {
print bsearch($file,$key);
}
sub bsearch {
my $file = shift;
my $key = shift;
my $min = 0;
my $max = -s $file;
if(not open ($fh, "<", $file)) {
error("Cannot open '$file'");
exit 1;
}
my $line;
while($max - $min > 1) {
$middle = int(($max + $min)/2);
seek($fh,$middle,0) or die;
my $half = <$fh>;
if(eof($fh)
or
compare(($line = <$fh>),$key) >= 0) {
$max = $middle;
} else {
$min = $middle;
}
}
seek($fh,$max,0) or die;
$line = <$fh>;
if(compare($line,$key) >= 0) {
if($opt::byte_offset) {
return "0\n";
} else {
# The very first line
return "";
}
} else {
if($opt::byte_offset) {
return tell($fh)."\n";
} else {
return $line;
}
}
}
sub compare {
my ($a,$b) = @_;
if($opt::random_sort) {
return rand() <=> rand();
}
if($opt::reverse) {
($a,$b) = ($b,$a);
}
if($opt::ignore_case) {
$a = uc($a);
$b = uc($b);
}
if($opt::numeric_sort) {
return $a <=> $b;
} elsif($opt::numascii) {
return $a <=> $b or $a cmp $b;
} else {
return $a cmp $b;
}
}
sub status {
my @w = @_;
my $fh = $Global::status_fd || *STDERR;
print $fh map { ($_, "\n") } @w;
flush $fh;
}
sub status_no_nl {
my @w = @_;
my $fh = $Global::status_fd || *STDERR;
print $fh @w;
flush $fh;
}
sub warning {
my @w = @_;
my $prog = $Global::progname || "parallel";
status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
}
sub error {
my @w = @_;
my $prog = $Global::progname || "parallel";
status(map { ($prog.": Error: ". $_); } @w);
}
sub die_bug {
my $bugid = shift;
print STDERR
("$Global::progname: This should not happen. You have found a bug.\n",
"Please contact <parallel\@gnu.org> and include:\n",
"* The version number: $Global::version\n",
"* The bugid: $bugid\n",
"* The command line being run\n",
"* The files being read (put the files on a webserver if they are big)\n",
"\n",
"If you get the error on smaller/fewer files, please include those instead.\n");
::wait_and_exit(255);
}
sub version {
# Returns: N/A
print join("\n",
"GNU $Global::progname $Global::version",
"Copyright (C) 2016",
"Ole Tange and Free Software Foundation, Inc.",
"License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
"This is free software: you are free to change and redistribute it.",
"GNU $Global::progname comes with no warranty.",
"",
"Web site: http://www.gnu.org/software/${Global::progname}\n",
"When using programs that use GNU Parallel to process data for publication",
"please cite as described in 'parallel --citation'.\n",
);
}