From 5020d3fbe78f8de7f620bf7974a8d8a9597f7e36 Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Mon, 11 Jul 2016 18:42:41 +0200 Subject: [PATCH 1/2] bsearch: binary search in sorted text files. Initial version. --- README | 2 + bsearch/bsearch | 146 +++++++++++++++++++++++++++++++++++++++++ bsearch/regressiontest | 43 ++++++++++++ 3 files changed, 191 insertions(+) create mode 100755 bsearch/bsearch create mode 100755 bsearch/regressiontest diff --git a/README b/README index 293ddc5..43924d5 100644 --- a/README +++ b/README @@ -2,6 +2,8 @@ Tools developed by Ole Tange . Probably not useful for you, but then again you never now. +bsearch - binary search through sorted text files. + em - Force emacs to run in terminal. Use xemacs if installed. field - Split on space. Give the given field number. Support syntax 1-3,6- diff --git a/bsearch/bsearch b/bsearch/bsearch new file mode 100755 index 0000000..ac1fcb2 --- /dev/null +++ b/bsearch/bsearch @@ -0,0 +1,146 @@ +#!/usr/bin/perl + +use Getopt::Long; + + +GetOptions( + "debug|D=s" => \$opt::D, + "version" => \$opt::version, + "verbose|v" => \$opt::verbose, + "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks, + "d|dictionary-order" => \$opt::dictionary_order, + "f|ignore-case" => \$opt::ignore_case, + "g|general-numeric-sort" => \$opt::general_numeric_sort, + "i|ignore-nonprinting" => \$opt::ignore_nonprinting, + "M|month-sort" => \$opt::month_sort, + "h|human-numeric-sort" => \$opt::human_numeric_sort, + "n|numeric-sort" => \$opt::numeric_sort, + "r|reverse" => \$opt::reverse, + "sort=s" => \$opt::sort, + "V|version-sort" => \$opt::version_sort, + "k|key=s" => \@opt::key, + "t|field-separator=s" => \$opt::field_separator, + "z|zero-terminated" => \$opt::zero_terminated, + ); +$Global::progname = "bsearch"; +$Global::version = 20160712; +if($opt::version) { + version(); + exit 0; +} + +my $file = shift; + +for my $key (@ARGV) { + print bsearch($file,$key),"\n"; +} + +sub bsearch { + my $file = shift; + my $key = shift; + my $min = 0; + my $max = -s $file; + + if(not open ($fh, "<", $file)) { + error("Cannot open '$file'"); + exit 1; + } + my $line; + while($max - $min > 1) { + $middle = int(($max+$min)/2); + seek($fh,$middle,0) or die; + my $half = <$fh>; + if(eof($fh) + or + compare(($line = <$fh>),$key) >= 0) { + $max = $middle; + } else { + $min = $middle; + } + } + seek($fh,$max,0) or die; + $line = <$fh>; + if(compare($line,$key) >= 0) { + return 0; + } else { + return tell $fh; + } +} + +# -n, --numeric-sort +# -r --reverse +# -f, --ignore-case + +sub compare { + my ($a,$b) = @_; + if($opt::reverse) { + ($a,$b) = ($b,$a); + } + if($opt::ignore_case) { + $a = uc($a); + $b = uc($b); + } + if($opt::numeric_sort) { + return $a <=> $b; + } elsif($opt::numascii) { + return $a <=> $b or $a cmp $b; + } else { + return $a cmp $b; + } +} + +sub status { + my @w = @_; + my $fh = $Global::status_fd || *STDERR; + print $fh map { ($_, "\n") } @w; + flush $fh; +} + +sub status_no_nl { + my @w = @_; + my $fh = $Global::status_fd || *STDERR; + print $fh @w; + flush $fh; +} + +sub warning { + my @w = @_; + my $prog = $Global::progname || "parallel"; + status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w); +} + +sub error { + my @w = @_; + my $prog = $Global::progname || "parallel"; + status(map { ($prog.": Error: ". $_); } @w); +} + +sub die_bug { + my $bugid = shift; + print STDERR + ("$Global::progname: This should not happen. You have found a bug.\n", + "Please contact and include:\n", + "* The version number: $Global::version\n", + "* The bugid: $bugid\n", + "* The command line being run\n", + "* The files being read (put the files on a webserver if they are big)\n", + "\n", + "If you get the error on smaller/fewer files, please include those instead.\n"); + ::wait_and_exit(255); +} + +sub version { + # Returns: N/A + print join("\n", + "GNU $Global::progname $Global::version", + "Copyright (C) 2016", + "Ole Tange and Free Software Foundation, Inc.", + "License GPLv3+: GNU GPL version 3 or later ", + "This is free software: you are free to change and redistribute it.", + "GNU $Global::progname comes with no warranty.", + "", + "Web site: http://www.gnu.org/software/${Global::progname}\n", + "When using programs that use GNU Parallel to process data for publication", + "please cite as described in 'parallel --citation'.\n", + ); +} diff --git a/bsearch/regressiontest b/bsearch/regressiontest new file mode 100755 index 0000000..d95d7e9 --- /dev/null +++ b/bsearch/regressiontest @@ -0,0 +1,43 @@ +#!/bin/bash + +test_tmp=`tempfile` +export test_tmp + +test_n() { + tmp=${test_tmp}_n + true > $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo > $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1.000000000 > $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1.000000000 > $tmp + echo 2 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1 > $tmp + echo 2.000000000 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1.000000000 > $tmp + echo 2 >> $tmp + echo 3 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1 > $tmp + echo 2.000000000 >> $tmp + echo 3 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1 > $tmp + echo 2 >> $tmp + echo 3.000000000 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 +} + + +export -f $(compgen -A function | grep test_) +compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1' From 1e87eb8f2ce3331a882314cff57cf0195d5e2464 Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Mon, 11 Jul 2016 19:38:12 +0200 Subject: [PATCH 2/2] bsearch: Initial man page. --- Makefile | 6 +- bsearch/bsearch | 270 +++++++++++++++++++++++++++++++++++++++-- bsearch/regressiontest | 1 + 3 files changed, 266 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index a6ebeca..918744a 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,12 @@ -CMD = blink histogram upsidedown tracefile timestamp rand rrm goodpasswd +CMD = blink bsearch histogram upsidedown tracefile timestamp rand rrm goodpasswd -all: blink/blink.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1 +all: blink/blink.1 bsearch/bsearch.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1 %.1: % pod2man $< > $@ install: mkdir -p /usr/local/bin - parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm + parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink bsearch reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm mkdir -p /usr/local/share/man/man1 parallel ln -sf `pwd`/{} /usr/local/share/man/man1/{/} ::: */*.1 diff --git a/bsearch/bsearch b/bsearch/bsearch index ac1fcb2..dba8e4e 100755 --- a/bsearch/bsearch +++ b/bsearch/bsearch @@ -1,12 +1,260 @@ #!/usr/bin/perl +=head1 NAME + +bsearch - binary search through sorted text files + +=head1 SYNOPSIS + +B [-nrfB] file string [string...] + +=head1 DESCRIPTION + +B searches a sorted file for a string. It outputs the +following line or the byte position of this line, which is where the +string would have been if it had been in the sorted file. + +=over 9 + +=item B<--ignore-leading-blanks> (not implemented) + +=item B<-b> + +ignore leading blanks + +=item B<--byte-offset> + +=item B<-B> + +print byte position where string would have been + +=item B<--dictionary-order> (not implemented) + +=item B<-d> + +consider only blanks and alphanumeric characters + +=item B<--debug> (not implemented) + +=item B<-D> + +annotate the part of the line used to sort, and warn about +questionable usage to stderr + +=item B<--ignore-case> + +=item B<-f> + +fold lower case to upper case characters + +=item B<--general-numeric-sort> (not implemented) + +=item B<-g> + +compare according to general numerical value + +=item B<--ignore-nonprinting> (not implemented) + +=item B<-i> + +consider only printable characters + +=item B<--month-sort> (not implemented) + +=item B<-M> + +compare (unknown) < 'JAN' < ... < 'DEC' + +=item B<--human-numeric-sort> (not implemented) + +=item B<-h> + +compare human readable numbers (e.g., 2K 1G) + +=item B<--key=KEYDEF> (not implemented) + +=item B<-k> + +sort via a key; KEYDEF gives location and type + +=item B<--numeric-sort> + +=item B<-n> + +compare according to string numerical value + +=item B<--random-sort> (not implemented) + +=item B<-R> + +sort by random hash of keys + +=item B<--reverse> + +=item B<-r> + +reverse the result of comparisons + +=item B<--sort=WORD> (not implemented) + +sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month +B<-M>, numeric B<-n>, random B<-R>, version B<-V> + +=item B<-t> (not implemented) + +=item B<--field-separator=SEP> + +use SEP instead of non-blank to blank transition + +=item B<-z> (not implemented) + +=item B<--zero-terminated> + +end lines with 0 byte, not newline + +=back + +=head1 EXAMPLES + +=head2 Missing + +Missing + + +=head1 REPORTING BUGS + +B is part of tangetools. Report bugs to . + + +=head1 AUTHOR + +Copyright (C) 2016 Ole Tange http://ole.tange.dk + + +=head1 LICENSE + +Copyright (C) 2013 Free Software Foundation, Inc. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +at your option any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +=head2 Documentation license I + +Permission is granted to copy, distribute and/or modify this documentation +under the terms of the GNU Free Documentation License, Version 1.3 or +any later version published by the Free Software Foundation; with no +Invariant Sections, with no Front-Cover Texts, and with no Back-Cover +Texts. A copy of the license is included in the file fdl.txt. + +=head2 Documentation license II + +You are free: + +=over 9 + +=item B + +to copy, distribute and transmit the work + +=item B + +to adapt the work + +=back + +Under the following conditions: + +=over 9 + +=item B + +You must attribute the work in the manner specified by the author or +licensor (but not in any way that suggests that they endorse you or +your use of the work). + +=item B + +If you alter, transform, or build upon this work, you may distribute +the resulting work only under the same, similar or a compatible +license. + +=back + +With the understanding that: + +=over 9 + +=item B + +Any of the above conditions can be waived if you get permission from +the copyright holder. + +=item B + +Where the work or any of its elements is in the public domain under +applicable law, that status is in no way affected by the license. + +=item B + +In no way are any of the following rights affected by the license: + +=over 9 + +=item * + +Your fair dealing or fair use rights, or other applicable +copyright exceptions and limitations; + +=item * + +The author's moral rights; + +=item * + +Rights other persons may have either in the work itself or in +how the work is used, such as publicity or privacy rights. + +=back + +=item B + +For any reuse or distribution, you must make clear to others the +license terms of this work. + +=back + +A copy of the full license is included in the file as cc-by-sa.txt. + +=head1 DEPENDENCIES + +B uses Perl and the Getopt::Long and Time::HiRes modules. + + +=head1 SEE ALSO + +B(1) + +=cut + use Getopt::Long; +Getopt::Long::Configure("bundling","require_order"); GetOptions( "debug|D=s" => \$opt::D, "version" => \$opt::version, "verbose|v" => \$opt::verbose, + "B|byte-offset" => \$opt::byte_offset, "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks, "d|dictionary-order" => \$opt::dictionary_order, "f|ignore-case" => \$opt::ignore_case, @@ -28,11 +276,12 @@ if($opt::version) { version(); exit 0; } +if($opt::zero_terminated) { $/ = "\0"; } my $file = shift; for my $key (@ARGV) { - print bsearch($file,$key),"\n"; + print bsearch($file,$key); } sub bsearch { @@ -47,7 +296,7 @@ sub bsearch { } my $line; while($max - $min > 1) { - $middle = int(($max+$min)/2); + $middle = int(($max + $min)/2); seek($fh,$middle,0) or die; my $half = <$fh>; if(eof($fh) @@ -61,16 +310,21 @@ sub bsearch { seek($fh,$max,0) or die; $line = <$fh>; if(compare($line,$key) >= 0) { - return 0; + if($opt::byte_offset) { + return "0\n"; + } else { + # The very first line + return ""; + } } else { - return tell $fh; + if($opt::byte_offset) { + return tell($fh)."\n"; + } else { + return $line; + } } } -# -n, --numeric-sort -# -r --reverse -# -f, --ignore-case - sub compare { my ($a,$b) = @_; if($opt::reverse) { diff --git a/bsearch/regressiontest b/bsearch/regressiontest index d95d7e9..6d77046 100755 --- a/bsearch/regressiontest +++ b/bsearch/regressiontest @@ -36,6 +36,7 @@ test_n() { echo 3.000000000 >> $tmp xargs < $tmp bsearch -n $tmp 0 2 2.1 100000 + rm $tmp }