From e8f520f642d139383b72a2677afa18b74a7114ed Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Sat, 28 Mar 2020 15:37:52 +0100 Subject: [PATCH] 2search: bsearch/bgrep renamed to 2search/2grep (bgrep is used by others). --- 2search/2grep | 777 +++++++++++++++++++++++++++++++++++++ 2search/2search | 777 +++++++++++++++++++++++++++++++++++++ 2search/regressiontest | 194 +++++++++ 2search/regressiontest.out | 280 +++++++++++++ Makefile | 25 +- bsearch/bsearch | 404 ------------------- bsearch/regressiontest | 44 --- 7 files changed, 2041 insertions(+), 460 deletions(-) create mode 100755 2search/2grep create mode 100755 2search/2search create mode 100755 2search/regressiontest create mode 100644 2search/regressiontest.out delete mode 100755 bsearch/bsearch delete mode 100755 bsearch/regressiontest diff --git a/2search/2grep b/2search/2grep new file mode 100755 index 0000000..ff5ca56 --- /dev/null +++ b/2search/2grep @@ -0,0 +1,777 @@ +#!/usr/bin/perl + +=head1 NAME + +2search - binary search through sorted text files + +2grep - binary search+grep through sorted text files + +=head1 SYNOPSIS + +B<2search> [-nrfB] file string [string...] + +B<2search> --grep [-nrf] file string [string...] + +B<2grep> [-nrf] file string [string...] + +... | B<2search> [-nrfB] file + +... | B<2search> --grep [-nrf] file + +... | B<2grep> [-nrf] file + +=head1 DESCRIPTION + +B<2search> searches a sorted file for a string. It outputs the +following line or the byte position of this line, which is where the +string would have been if it had been in the sorted file. + +B<2grep> output all lines starting with a given string. The file must +be sorted. 
+ +=over 9 + +=item B<--ignore-leading-blanks> + +=item B<-b> + +ignore leading blanks + + +=item B<--byte-offset> + +=item B<-B> + +print byte position where string would have been + + +=item B<--dictionary-order> (not implemented) + +=item B<-d> + +consider only blanks and alphanumeric characters + + +=item B<--debug> (not implemented) + +=item B<-D> + +annotate the part of the line used to sort, and warn about +questionable usage to stderr + + +=item B<--ignore-case> + +=item B<-f> + +fold lower case to upper case characters + + +=item B<--file> I<file> + +=item B<-F> I<file> + +search for all lines in I<file> + + +=item B<--general-numeric-sort> (not implemented) + +=item B<-g> + +compare according to general numerical value + + +=item B<--ignore-nonprinting> (not implemented) + +=item B<-i> + +consider only printable characters + + +=item B<--month-sort> + +=item B<-M> + +compare (unknown) < 'JAN' < ... < 'DEC' + + +=item B<--human-numeric-sort> + +=item B<-h> + +compare human readable numbers (e.g., 2K 1G) + + +=item B<--key=KEYDEF> (not implemented) + +=item B<-k> + +sort via a key; KEYDEF gives location and type + + +=item B<--numeric-sort> + +=item B<-n> + +compare according to string numerical value. If numerical values are +the same: split the string into blocks of numbers and non-numbers, and +compare numbers as numbers and strings as strings. + +This will sort like this: chr3 chr11 3chr 11chr + + +=item B<--numascii> + +=item B<-N> + +compare according to string numerical value. 
If numerical values are +the same: compare as strings + + +=item B<--random-sort> + +=item B<-R> + +sort by random hash of keys + + +=item B<--reverse> + +=item B<-r> + +reverse the result of comparisons + + +=item B<--sort=WORD> (not implemented) + +sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month +B<-M>, numeric B<-n>, random B<-R>, version B<-V> + + +=item B<-t> + +=item B<--field-separator=SEP> + +use SEP instead of non-blank to blank transition + + +=item B<-z> + +=item B<--zero-terminated> + +end lines with 0 byte, not newline + +=back + +=head1 EXAMPLES + +=head2 Single key + +Input is sorted by Chromosome,Position: + + SampleID Position Chromosome + foo 10000123 chr3 + foo 10000125 chr3 + foo 9999998 chr11 + foo 10000124 chr11 + foo 10000126 chr11 + +To find all chr3: + + 2grep -n -k3 inputfile chr3 + +-n will split 'chr3' into 'chr' which is compared asciibetically and +'3' which is compared numerically. + +=head2 Not implemented + +To find all lines with chr3,10000125: + + 2grep -k3n,2n inputfile chr3 10000125 + + + +=head1 REPORTING BUGS + +B<2search> is part of tangetools. Report bugs at https://gitlab.com/ole.tange/tangetools/-/issues. + + +=head1 AUTHOR + +Copyright (C) 2016-2020 Ole Tange http://ole.tange.dk + + +=head1 LICENSE + +Copyright (C) 2013 Free Software Foundation, Inc. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +at your option any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +=head2 Documentation license I + +Permission is granted to copy, distribute and/or modify this documentation +under the terms of the GNU Free Documentation License, Version 1.3 or +any later version published by the Free Software Foundation; with no +Invariant Sections, with no Front-Cover Texts, and with no Back-Cover +Texts. A copy of the license is included in the file fdl.txt. + +=head2 Documentation license II + +You are free: + +=over 9 + +=item B<to Share> + +to copy, distribute and transmit the work + +=item B<to Remix> + +to adapt the work + +=back + +Under the following conditions: + +=over 9 + +=item B<Attribution> + +You must attribute the work in the manner specified by the author or +licensor (but not in any way that suggests that they endorse you or +your use of the work). + +=item B<Share Alike> + +If you alter, transform, or build upon this work, you may distribute +the resulting work only under the same, similar or a compatible +license. + +=back + +With the understanding that: + +=over 9 + +=item B<Waiver> + +Any of the above conditions can be waived if you get permission from +the copyright holder. + +=item B<Public Domain> + +Where the work or any of its elements is in the public domain under +applicable law, that status is in no way affected by the license. + +=item B<Other Rights> + +In no way are any of the following rights affected by the license: + +=over 9 + +=item * + +Your fair dealing or fair use rights, or other applicable +copyright exceptions and limitations; + +=item * + +The author's moral rights; + +=item * + +Rights other persons may have either in the work itself or in +how the work is used, such as publicity or privacy rights. + +=back + +=item B<Notice> + +For any reuse or distribution, you must make clear to others the +license terms of this work. + +=back + +A copy of the full license is included in the file as cc-by-sa.txt. + +=head1 DEPENDENCIES + +B<2search>/B<2grep> uses Perl. + + +=head1 SEE ALSO + +B<grep>(1), B<sort>(1). 

=cut

use strict;
use Getopt::Long;
use IO::Handle;

# Allow bundled short options (-nr) and stop option parsing at the
# first non-option argument (the file name).
Getopt::Long::Configure("bundling","require_order");

GetOptions(
    "debug|D" => \$opt::D,
    "version" => \$opt::version,
    "verbose|v" => \$opt::verbose,
    "B|byte-offset" => \$opt::byte_offset,
    "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
    "d|dictionary-order" => \$opt::dictionary_order,
    "f|ignore-case" => \$opt::ignore_case,
    "g|general-numeric-sort" => \$opt::general_numeric_sort,
    "G|grep" => \$opt::grep,
    "F|file=s" => \$opt::file,
    "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
    "M|month-sort" => \$opt::month_sort,
    "h|human-numeric-sort" => \$opt::human_numeric_sort,
    "n|numeric-sort" => \$opt::numeric_sort,
    "N|numascii" => \$opt::numascii,
    "r|reverse" => \$opt::reverse,
    "R|random-sort" => \$opt::random_sort,
    "sort=s" => \$opt::sort,
    "V|version-sort" => \$opt::version_sort,
    "k|key=s" => \@opt::key,
    "t|field-separator=s" => \$opt::field_separator,
    "z|zero-terminated" => \$opt::zero_terminated,
    );
# Basename of the program: decides between 2search and 2grep behaviour
$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
$Global::version = 20200328;
if($opt::version) { version(); exit 0; }
if($opt::zero_terminated) { $/ = "\0"; }
if(@opt::key) {
    # Default separator if --key = whitespace
    # NOTE: the separator is used as a regular expression by compare()
    $Global::sep = '\s+';
    if(defined $opt::field_separator) { $Global::sep = $opt::field_separator; }
}
if($Global::progname eq "2grep") { $opt::grep = 1; }
$Global::debug = $opt::D;

parse_keydef();

# Only build the dump when debugging: my_dump() is expensive
if($Global::debug) { debug(my_dump(\@Global::keydefs),"\n"); }

# First non-option argument is the sorted file; the rest (if any) are
# the strings to search for.
my $file = shift;
if(@ARGV) {
    $opt::argv = 1;
} elsif(defined $opt::file) {
    # Search strings are read from --file by get()
} else {
    $opt::stdin = 1;
}

# Each round consumes one search value per key definition and does
# one lookup. Stop when the values run out.
 round:
while(1) {
    my @search_vals;
    for(@Global::keydefs) {
        my $val = get();
        if(not defined $val) {
            last round;
        }
        push @search_vals, $val;
    }
    if($opt::grep) {
        bgrep($file,@search_vals);
    } else {
        print bsearch($file,@search_vals);
    }
}

{
    # Handle for the --file list of search strings; opened on first use
    # and shared between calls of get()
    my $fh;

    sub get {
        # Get the next string to search for
        # Uses:
        #   $opt::argv = take the strings from the command line
        #   $opt::file = take the strings from this file
        #   $opt::stdin = take the strings from stdin
        # Returns:
        #   $val = next search string with the record separator removed
        #   undef if there are no more search strings
        if($opt::argv) {
            # Search for strings on the command line
            return shift @ARGV;
        }
        if(defined $opt::file) {
            # Search for strings given with --file
            if(not $fh) {
                # Open into the shared $fh: declaring a new 'my $fh'
                # here would leave the shared handle undefined.
                if(not open($fh, "<", $opt::file)) {
                    error("Cannot open $opt::file");
                    exit(255);
                }
            }
            my $val = <$fh>;
            defined $val and chomp $val;
            return $val;
        }
        if($opt::stdin) {
            # Search for strings on stdin
            my $val = <>;
            defined $val and chomp $val;
            return $val;
        }
        die;
    }
}

sub bgrep {
    # Print all lines in a sorted file that start with the search values
    # Input:
    #   $file = sorted file to search
    #   @search_vals = one search value per key definition
    # Returns: N/A
    my $file = shift;
    my @search_vals = @_;
    # bsearch() must return a byte position here, whatever the user
    # asked for; 'local' restores the user's setting on return.
    local $opt::byte_offset = 1;
    my $startpos = bsearch($file,@search_vals);
    my $fh;
    if(not open ($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    # $startpos is "<number>\n": used as a number by seek()
    seek($fh,$startpos,0)
        or die("Cannot seek to byte ".int($startpos)." in '$file'");
    # Allow for partial matches in grep (4 matches 40, A matches Aaa)
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 1;
    }
    my $line;
    # 'defined' so a final line containing only "0" is still printed
    while(defined($line = <$fh>)
          and
          not compare($line,@search_vals)) {
        print $line;
    }
    close $fh;
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 0;
    }
}

sub bsearch {
    # Binary search a sorted file for the search values
    # Input:
    #   $file = sorted file to search
    #   @search_vals = one search value per key definition
    # Uses:
    #   $opt::byte_offset = return the byte position instead of the line
    # Returns:
    #   the first line that compares >= the search values (undef if
    #   there is no such line), or "$byte_position\n" of where that
    #   line starts if $opt::byte_offset is set
    my $file = shift;
    my @search_vals = @_;
    my $min = 0;
    my $max = -s $file;
    my $fh;
    if(not open ($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    my($line,$middle);
    # $min/$max are raw byte positions; $minnl/$maxnl are the starts of
    # the lines following those positions.
    my $minnl = $min;
    my $maxnl = $max;
    while($max - $min > 1) {
        $middle = int(($max + $min)/2);
        seek($fh,$middle,0) or die("Cannot seek to $middle");
        if($middle > 0) {
            # Read last half of a line
            <$fh>;
        }
        my $newline_pos = tell($fh);
        debug("$min <= $middle <= $newline_pos <= $max\n");
        debug("$minnl <= $newline_pos <= $maxnl\n");
        if($newline_pos == $maxnl
           or
           eof($fh)
           or
           compare(($line = <$fh>),@search_vals) >= 0) {
            # We have seen this newline position before
            # or we are at the end of the file
            # or we should search the lower half
            $max = $middle;
            $maxnl = $newline_pos;
        } else {
            # We should search the upper half
            $min = $middle;
            $minnl = $newline_pos;
        }
    }
    seek($fh,$minnl,0) or die("Cannot seek to $minnl");
    $line = <$fh>;
    my $result;
    if(compare($line,@search_vals) >= 0) {
        # The line at $minnl is where the search values belong
        if($opt::byte_offset) {
            $result = $minnl."\n";
        } else {
            $result = $line;
        }
    } else {
        # The search values belong just after the line at $minnl
        if($opt::byte_offset) {
            $result = tell($fh)."\n";
        } else {
            $result = <$fh>;
        }
    }
    close $fh;
    return $result;
}

sub parse_keydef {
    # Parse the --key definitions F[.C][OPTS][,F[.C][OPTS]]
    # Uses:
    #   @opt::key = key definitions from the command line
    #   $opt::* = global ordering options; defaults for every key
    # Sets:
    #   @Global::keydefs = one hash ref per key with 'field', 'char'
    #     and one entry per ordering option
    #   $Global::sep = undef if no --key is given (compare whole lines)
    # Returns: N/A
    my %defaultorder = (
        "b" => $opt::ignore_leading_blanks,
        "d" => $opt::dictionary_order,
        "f" => $opt::ignore_case,
        "g" => $opt::general_numeric_sort,
        "i" => $opt::ignore_nonprinting,
        "M" => $opt::month_sort,
        "h" => $opt::human_numeric_sort,
        "n" => $opt::numeric_sort,
        "N" => $opt::numascii,
        "r" => $opt::reverse,
        "R" => $opt::random_sort,
        "V" => $opt::version_sort,
        );
    my %ordertbl = (
        "b" => 'ignore_leading_blanks',
        "d" => 'dictionary_order',
        "f" => 'ignore_case',
        "g" => 'general_numeric_sort',
        "i" => 'ignore_nonprinting',
        "M" => 'month_sort',
        "h" => 'human_numeric_sort',
        "n" => 'numeric_sort',
        "N" => 'numascii',
        "r" => 'reverse',
        "R" => 'random_sort',
        "V" => 'version_sort',
        );
    # Ordering options shared by all keys, keyed by their long name
    my %default_opt =
        map { $ordertbl{$_} => $defaultorder{$_} } keys %defaultorder;

    if(not @opt::key) {
        # Convert -n -r to -k1rn
        # with sep = undef
        $Global::sep = undef;
        push(@Global::keydefs, { %default_opt, 'field' => 1, 'char' => 1 });
    }

    for my $keydefs (@opt::key) {
        for my $keydef (split /,/, $keydefs) {
            if($keydef =~ /^(\d+)(\.(\d+))?([bdfgiMhnNRrV]+)?$/) {
                # parse keydef F[.C][OPTS]
                my ($field,$char,$flags) = ($1,$3,$4);
                my $opt = { %default_opt,
                            'field' => $field,
                            'char' => $char || 1 };
                # Per-key option letters switch options on for this key
                for my $o (split //, defined $flags ? $flags : "") {
                    $opt->{$ordertbl{$o}} = 1;
                }
                push(@Global::keydefs,$opt);
            } else {
                error("Keydef $keydef does not match F[.C][OPTS]");
                exit(255);
            }
        }
    }
}

sub compare {
    # Compare a line from the file with the search values
    # Input:
    #   $line = line from the sorted file
    #   @search_vals = one key to search for per search column
    # Uses:
    #   $Global::sep = regexp to split $line into fields (if set)
    #   @Global::keydefs = key definitions from parse_keydef()
    # Returns:
    #   result of the first key that differs (see compare_single())
    #   0 if no key differs
    my($line,@search_vals) = @_;
    chomp($line);
    debug("Compare: $line <=> @search_vals ");
    my @field;
    # Test length, not truth, so the separator '0' is honoured
    if(defined $Global::sep and length $Global::sep) {
        # Split line
        @field = split /$Global::sep/o, $line;
    } else {
        @field = ($line);
    }
    my @tmp_vals = @search_vals;
    for my $keydef (@Global::keydefs) {
        # keydef = F[.C][OPTS]: field and char are counted from 1
        my $f = $keydef->{'field'};
        my $c = $keydef->{'char'};
        my $cmp = compare_single(substr($field[$f-1],$c-1),
                                 shift @tmp_vals,$keydef);
        debug("== $cmp\n");
        # They differ on this key
        if($cmp) { return $cmp; }
    }
    # No difference on any keydefs
    return 0;
}

sub compare_single {
    # Compare two values based on the order options of a key
    # Input:
    #   $left = value from the file
    #   $right = value searched for
    #   $opt = key definition (hash ref from parse_keydef())
    # Returns:
    #   <0, 0, >0 if $left sorts before, equal to, after $right
    my ($left,$right,$opt) = @_;
    debug("$left <=> $right");
    # Only build the dump when debugging: my_dump() is expensive
    if($Global::debug) { debug(my_dump($opt),"\n"); }
    if($opt->{'random_sort'}) {
        return rand() <=> rand();
    }
    if($opt->{'ignore_leading_blanks'}) {
        $left =~ s/^\s+//;
        $right =~ s/^\s+//;
    }
    if($opt->{'ignore_case'}) {
        $left = uc($left);
        $right = uc($right);
    }
    if($opt->{'partial_match'}) {
        # String 'foo' matches 'foobar'
        $left = substr($left,0,length $right);
    }
    if($opt->{'reverse'}) {
        ($left,$right) = ($right,$left);
    }
    if($opt->{'human_numeric_sort'}) {
        return multiply_binary_prefix($left) <=> multiply_binary_prefix($right);
    }
    if($opt->{'month_sort'}) {
        if(not %Global::month_no) {
            # JAN => 1 ... DEC => 12. The right side must be a list:
            # {1..12} would be a single hash ref.
            @Global::month_no{qw(JAN FEB MAR APR MAY JUN
                                 JUL AUG SEP OCT NOV DEC)} = (1..12);
        }
        # Unknown month names sort first (as 0)
        return(($Global::month_no{$left} || 0)
               <=>
               ($Global::month_no{$right} || 0));
    }
    if($opt->{'numeric_sort'}) {
        return $left <=> $right;
    } elsif($opt->{'numascii'}) {
        # '||' and not 'or': 'return A or B' never evaluates B
        return(($left <=> $right) || ($left cmp $right));
    } else {
        return $left cmp $right;
    }
}

sub multiply_binary_prefix(@) {
    # Evaluate numbers with binary prefix
    # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^60, Zi=2^70, Yi=2^80
    # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^60, zi=2^70, yi=2^80
    # K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^60, Z =2^70, Y =2^80
    # k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24
    # (Xi, xi, X = 2^90 and x = 10^27 are accepted, too)
    # 13G = 13*1024*1024*1024 = 13958643712
    # Input:
    #   @v = strings with prefixes (undef values are left untouched)
    # Returns:
    #   list context: values with prefixes multiplied
    #   scalar context: first value with prefixes multiplied
    my @v = @_;
    # Prefixes in increasing order: the N'th stands for N factors
    my @prefix = qw(k m g t p e z y x);
    for my $val (@v) {
        defined $val or next;
        # 1E3=1000, 1E-3=0.001
        $val =~ s/e([+-]?\d+)/*10**$1/gi;
        # Ki, Mi, ... in any case: factors of 1024
        for my $i (0..$#prefix) {
            my $p = $prefix[$i];
            my $mult = "*1024" x ($i+1);
            $val =~ s/${p}i/$mult/gi;
        }
        # K, M, ... (upper case): factors of 1024
        for my $i (0..$#prefix) {
            my $p = uc $prefix[$i];
            my $mult = "*1024" x ($i+1);
            $val =~ s/$p/$mult/g;
        }
        # k, m, ... (lower case): factors of 1000
        for my $i (0..$#prefix) {
            my $p = $prefix[$i];
            my $mult = "*1000" x ($i+1);
            $val =~ s/$p/$mult/g;
        }
        # SECURITY: this evaluates text that comes from the searched
        # file and from the search strings as Perl code. Do not use
        # -h on untrusted input.
        $val = eval $val;
    }
    return wantarray ? @v : $v[0];
}

sub status {
    # Print each message followed by a newline on the status handle
    # Uses:
    #   $Global::status_fd = handle to print on (default: STDERR)
    # Returns: N/A
    my @w = @_;
    my $fh = $Global::status_fd || *STDERR;
    print {$fh} map { ($_, "\n") } @w;
    $fh->flush();
}

sub status_no_nl {
    # Print the messages as they are on the status handle
    # Uses:
    #   $Global::status_fd = handle to print on (default: STDERR)
    # Returns: N/A
    my @w = @_;
    my $fh = $Global::status_fd || *STDERR;
    print {$fh} @w;
    $fh->flush();
}

sub warning {
    # Print "progname: Warning: message" for each message
    # Returns: N/A
    my @w = @_;
    my $prog = $Global::progname || "2search";
    status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
}

sub error {
    # Print "progname: Error: message" for each message
    # Returns: N/A
    my @w = @_;
    my $prog = $Global::progname || "2search";
    status(map { ($prog.": Error: ". $_); } @w);
}

sub die_bug {
    # Print instructions for reporting a bug and exit with 255
    # Input:
    #   $bugid = identifier of the place the bug was detected
    my $bugid = shift;
    print STDERR
        ("$Global::progname: This should not happen. You have found a bug.\n",
         "Please submit a bug at https://gitlab.com/ole.tange/tangetools/-/issues\n",
         "and include:\n",
         "* The version number: $Global::version\n",
         "* The bugid: $bugid\n",
         "* The command line being run\n",
         "* The files being read (put the files on a webserver if they are big)\n",
         "\n",
         "If you get the error on smaller/fewer files, please include those instead.\n");
    exit(255);
}

sub version {
    # Print the version and license information
    # Returns: N/A
    print join("\n",
               "$Global::progname $Global::version",
               "Copyright (C) 2016-2020",
               "Ole Tange and Free Software Foundation, Inc.",
               "License GPLv3+: GNU GPL version 3 or later ",
               "This is free software: you are free to change and redistribute it.",
               "$Global::progname comes with no warranty.",
               "",
               "Web site: https://gitlab.com/ole.tange/tangetools/\n",
        );
}

sub my_dump(@) {
    # Returns:
    #   ascii expression of object if Data::Dump(er) is installed
    #   error code otherwise
    my @dump_this = (@_);
    if(eval { require Data::Dump; 1 }) {
        return (Data::Dump::dump(@dump_this));
    }
    # Data::Dump not installed
    if(eval { require Data::Dumper; 1 }) {
        return Data::Dumper::Dumper(@dump_this);
    }
    my $err = "Neither Data::Dump nor Data::Dumper is installed\n".
        "Not dumping output\n";
    ::status($err);
    return $err;
}

sub debug(@) {
    # Print the arguments on stdout if debugging is on
    # Uses:
    #   $Global::debug = true if debugging is on
    # Returns: N/A
    $Global::debug or return;
    print @_;
}
diff --git a/2search/2search b/2search/2search new file mode 100755 index 0000000..ff5ca56 --- /dev/null +++ b/2search/2search @@ -0,0 +1,777 @@ +#!/usr/bin/perl + +=head1 NAME + +2search - binary search through sorted text files + +2grep - binary search+grep through sorted text files + +=head1 SYNOPSIS + +B<2search> [-nrfB] file string [string...] + +B<2search> --grep [-nrf] file string [string...] + +B<2grep> [-nrf] file string [string...] + +... 
| B<2search> [-nrfB] file + +... | B<2search> --grep [-nrf] file + +... | B<2grep> [-nrf] file + +=head1 DESCRIPTION + +B<2search> searches a sorted file for a string. It outputs the +following line or the byte position of this line, which is where the +string would have been if it had been in the sorted file. + +B<2grep> output all lines starting with a given string. The file must +be sorted. + +=over 9 + +=item B<--ignore-leading-blanks> + +=item B<-b> + +ignore leading blanks + + +=item B<--byte-offset> + +=item B<-B> + +print byte position where string would have been + + +=item B<--dictionary-order> (not implemented) + +=item B<-d> + +consider only blanks and alphanumeric characters + + +=item B<--debug> (not implemented) + +=item B<-D> + +annotate the part of the line used to sort, and warn about +questionable usage to stderr + + +=item B<--ignore-case> + +=item B<-f> + +fold lower case to upper case characters + + +=item B<--file> I<file> + +=item B<-F> I<file> + +search for all lines in I<file> + + +=item B<--general-numeric-sort> (not implemented) + +=item B<-g> + +compare according to general numerical value + + +=item B<--ignore-nonprinting> (not implemented) + +=item B<-i> + +consider only printable characters + + +=item B<--month-sort> + +=item B<-M> + +compare (unknown) < 'JAN' < ... < 'DEC' + + +=item B<--human-numeric-sort> + +=item B<-h> + +compare human readable numbers (e.g., 2K 1G) + + +=item B<--key=KEYDEF> (not implemented) + +=item B<-k> + +sort via a key; KEYDEF gives location and type + + +=item B<--numeric-sort> + +=item B<-n> + +compare according to string numerical value. If numerical values are +the same: split the string into blocks of numbers and non-numbers, and +compare numbers as numbers and strings as strings. + +This will sort like this: chr3 chr11 3chr 11chr + + +=item B<--numascii> + +=item B<-N> + +compare according to string numerical value. 
If numerical values are +the same: compare as strings + + +=item B<--random-sort> + +=item B<-R> + +sort by random hash of keys + + +=item B<--reverse> + +=item B<-r> + +reverse the result of comparisons + + +=item B<--sort=WORD> (not implemented) + +sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month +B<-M>, numeric B<-n>, random B<-R>, version B<-V> + + +=item B<-t> + +=item B<--field-separator=SEP> + +use SEP instead of non-blank to blank transition + + +=item B<-z> + +=item B<--zero-terminated> + +end lines with 0 byte, not newline + +=back + +=head1 EXAMPLES + +=head2 Single key + +Input is sorted by Chromosome,Position: + + SampleID Position Chromosome + foo 10000123 chr3 + foo 10000125 chr3 + foo 9999998 chr11 + foo 10000124 chr11 + foo 10000126 chr11 + +To find all chr3: + + 2grep -n -k3 inputfile chr3 + +-n will split 'chr3' into 'chr' which is compared asciibetically and +'3' which is compared numerically. + +=head2 Not implemented + +To find all lines with chr3,10000125: + + 2grep -k3n,2n inputfile chr3 10000125 + + + +=head1 REPORTING BUGS + +B<2search> is part of tangetools. Report bugs at https://gitlab.com/ole.tange/tangetools/-/issues. + + +=head1 AUTHOR + +Copyright (C) 2016-2020 Ole Tange http://ole.tange.dk + + +=head1 LICENSE + +Copyright (C) 2013 Free Software Foundation, Inc. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +at your option any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +=head2 Documentation license I + +Permission is granted to copy, distribute and/or modify this documentation +under the terms of the GNU Free Documentation License, Version 1.3 or +any later version published by the Free Software Foundation; with no +Invariant Sections, with no Front-Cover Texts, and with no Back-Cover +Texts. A copy of the license is included in the file fdl.txt. + +=head2 Documentation license II + +You are free: + +=over 9 + +=item B<to Share> + +to copy, distribute and transmit the work + +=item B<to Remix> + +to adapt the work + +=back + +Under the following conditions: + +=over 9 + +=item B<Attribution> + +You must attribute the work in the manner specified by the author or +licensor (but not in any way that suggests that they endorse you or +your use of the work). + +=item B<Share Alike> + +If you alter, transform, or build upon this work, you may distribute +the resulting work only under the same, similar or a compatible +license. + +=back + +With the understanding that: + +=over 9 + +=item B<Waiver> + +Any of the above conditions can be waived if you get permission from +the copyright holder. + +=item B<Public Domain> + +Where the work or any of its elements is in the public domain under +applicable law, that status is in no way affected by the license. + +=item B<Other Rights> + +In no way are any of the following rights affected by the license: + +=over 9 + +=item * + +Your fair dealing or fair use rights, or other applicable +copyright exceptions and limitations; + +=item * + +The author's moral rights; + +=item * + +Rights other persons may have either in the work itself or in +how the work is used, such as publicity or privacy rights. + +=back + +=item B<Notice> + +For any reuse or distribution, you must make clear to others the +license terms of this work. + +=back + +A copy of the full license is included in the file as cc-by-sa.txt. + +=head1 DEPENDENCIES + +B<2search>/B<2grep> uses Perl. + + +=head1 SEE ALSO + +B<grep>(1), B<sort>(1). 

=cut

use strict;
use Getopt::Long;
use IO::Handle;

# Allow bundled short options (-nr) and stop option parsing at the
# first non-option argument (the file name).
Getopt::Long::Configure("bundling","require_order");

GetOptions(
    "debug|D" => \$opt::D,
    "version" => \$opt::version,
    "verbose|v" => \$opt::verbose,
    "B|byte-offset" => \$opt::byte_offset,
    "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
    "d|dictionary-order" => \$opt::dictionary_order,
    "f|ignore-case" => \$opt::ignore_case,
    "g|general-numeric-sort" => \$opt::general_numeric_sort,
    "G|grep" => \$opt::grep,
    "F|file=s" => \$opt::file,
    "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
    "M|month-sort" => \$opt::month_sort,
    "h|human-numeric-sort" => \$opt::human_numeric_sort,
    "n|numeric-sort" => \$opt::numeric_sort,
    "N|numascii" => \$opt::numascii,
    "r|reverse" => \$opt::reverse,
    "R|random-sort" => \$opt::random_sort,
    "sort=s" => \$opt::sort,
    "V|version-sort" => \$opt::version_sort,
    "k|key=s" => \@opt::key,
    "t|field-separator=s" => \$opt::field_separator,
    "z|zero-terminated" => \$opt::zero_terminated,
    );
# Basename of the program: decides between 2search and 2grep behaviour
$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
$Global::version = 20200328;
if($opt::version) { version(); exit 0; }
if($opt::zero_terminated) { $/ = "\0"; }
if(@opt::key) {
    # Default separator if --key = whitespace
    # NOTE: the separator is used as a regular expression by compare()
    $Global::sep = '\s+';
    if(defined $opt::field_separator) { $Global::sep = $opt::field_separator; }
}
if($Global::progname eq "2grep") { $opt::grep = 1; }
$Global::debug = $opt::D;

parse_keydef();

# Only build the dump when debugging: my_dump() is expensive
if($Global::debug) { debug(my_dump(\@Global::keydefs),"\n"); }

# First non-option argument is the sorted file; the rest (if any) are
# the strings to search for.
my $file = shift;
if(@ARGV) {
    $opt::argv = 1;
} elsif(defined $opt::file) {
    # Search strings are read from --file by get()
} else {
    $opt::stdin = 1;
}

# Each round consumes one search value per key definition and does
# one lookup. Stop when the values run out.
 round:
while(1) {
    my @search_vals;
    for(@Global::keydefs) {
        my $val = get();
        if(not defined $val) {
            last round;
        }
        push @search_vals, $val;
    }
    if($opt::grep) {
        bgrep($file,@search_vals);
    } else {
        print bsearch($file,@search_vals);
    }
}

{
    # Handle for the --file list of search strings; opened on first use
    # and shared between calls of get()
    my $fh;

    sub get {
        # Get the next string to search for
        # Uses:
        #   $opt::argv = take the strings from the command line
        #   $opt::file = take the strings from this file
        #   $opt::stdin = take the strings from stdin
        # Returns:
        #   $val = next search string with the record separator removed
        #   undef if there are no more search strings
        if($opt::argv) {
            # Search for strings on the command line
            return shift @ARGV;
        }
        if(defined $opt::file) {
            # Search for strings given with --file
            if(not $fh) {
                # Open into the shared $fh: declaring a new 'my $fh'
                # here would leave the shared handle undefined.
                if(not open($fh, "<", $opt::file)) {
                    error("Cannot open $opt::file");
                    exit(255);
                }
            }
            my $val = <$fh>;
            defined $val and chomp $val;
            return $val;
        }
        if($opt::stdin) {
            # Search for strings on stdin
            my $val = <>;
            defined $val and chomp $val;
            return $val;
        }
        die;
    }
}

sub bgrep {
    # Print all lines in a sorted file that start with the search values
    # Input:
    #   $file = sorted file to search
    #   @search_vals = one search value per key definition
    # Returns: N/A
    my $file = shift;
    my @search_vals = @_;
    # bsearch() must return a byte position here, whatever the user
    # asked for; 'local' restores the user's setting on return.
    local $opt::byte_offset = 1;
    my $startpos = bsearch($file,@search_vals);
    my $fh;
    if(not open ($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    # $startpos is "<number>\n": used as a number by seek()
    seek($fh,$startpos,0)
        or die("Cannot seek to byte ".int($startpos)." in '$file'");
    # Allow for partial matches in grep (4 matches 40, A matches Aaa)
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 1;
    }
    my $line;
    # 'defined' so a final line containing only "0" is still printed
    while(defined($line = <$fh>)
          and
          not compare($line,@search_vals)) {
        print $line;
    }
    close $fh;
    for my $keydef (@Global::keydefs) {
        $keydef->{'partial_match'} = 0;
    }
}

sub bsearch {
    # Binary search a sorted file for the search values
    # Input:
    #   $file = sorted file to search
    #   @search_vals = one search value per key definition
    # Uses:
    #   $opt::byte_offset = return the byte position instead of the line
    # Returns:
    #   the first line that compares >= the search values (undef if
    #   there is no such line), or "$byte_position\n" of where that
    #   line starts if $opt::byte_offset is set
    my $file = shift;
    my @search_vals = @_;
    my $min = 0;
    my $max = -s $file;
    my $fh;
    if(not open ($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    my($line,$middle);
    # $min/$max are raw byte positions; $minnl/$maxnl are the starts of
    # the lines following those positions.
    my $minnl = $min;
    my $maxnl = $max;
    while($max - $min > 1) {
        $middle = int(($max + $min)/2);
        seek($fh,$middle,0) or die("Cannot seek to $middle");
        if($middle > 0) {
            # Read last half of a line
            <$fh>;
        }
        my $newline_pos = tell($fh);
        debug("$min <= $middle <= $newline_pos <= $max\n");
        debug("$minnl <= $newline_pos <= $maxnl\n");
        if($newline_pos == $maxnl
           or
           eof($fh)
           or
           compare(($line = <$fh>),@search_vals) >= 0) {
            # We have seen this newline position before
            # or we are at the end of the file
            # or we should search the lower half
            $max = $middle;
            $maxnl = $newline_pos;
        } else {
            # We should search the upper half
            $min = $middle;
            $minnl = $newline_pos;
        }
    }
    seek($fh,$minnl,0) or die("Cannot seek to $minnl");
    $line = <$fh>;
    my $result;
    if(compare($line,@search_vals) >= 0) {
        # The line at $minnl is where the search values belong
        if($opt::byte_offset) {
            $result = $minnl."\n";
        } else {
            $result = $line;
        }
    } else {
        # The search values belong just after the line at $minnl
        if($opt::byte_offset) {
            $result = tell($fh)."\n";
        } else {
            $result = <$fh>;
        }
    }
    close $fh;
    return $result;
}

sub parse_keydef {
    # Parse the --key definitions F[.C][OPTS][,F[.C][OPTS]]
    # Uses:
    #   @opt::key = key definitions from the command line
    #   $opt::* = global ordering options; defaults for every key
    # Sets:
    #   @Global::keydefs = one hash ref per key with 'field', 'char'
    #     and one entry per ordering option
    #   $Global::sep = undef if no --key is given (compare whole lines)
    # Returns: N/A
    my %defaultorder = (
        "b" => $opt::ignore_leading_blanks,
        "d" => $opt::dictionary_order,
        "f" => $opt::ignore_case,
        "g" => $opt::general_numeric_sort,
        "i" => $opt::ignore_nonprinting,
        "M" => $opt::month_sort,
        "h" => $opt::human_numeric_sort,
        "n" => $opt::numeric_sort,
        "N" => $opt::numascii,
        "r" => $opt::reverse,
        "R" => $opt::random_sort,
        "V" => $opt::version_sort,
        );
    my %ordertbl = (
        "b" => 'ignore_leading_blanks',
        "d" => 'dictionary_order',
        "f" => 'ignore_case',
        "g" => 'general_numeric_sort',
        "i" => 'ignore_nonprinting',
        "M" => 'month_sort',
        "h" => 'human_numeric_sort',
        "n" => 'numeric_sort',
        "N" => 'numascii',
        "r" => 'reverse',
        "R" => 'random_sort',
        "V" => 'version_sort',
        );
    # Ordering options shared by all keys, keyed by their long name
    my %default_opt =
        map { $ordertbl{$_} => $defaultorder{$_} } keys %defaultorder;

    if(not @opt::key) {
        # Convert -n -r to -k1rn
        # with sep = undef
        $Global::sep = undef;
        push(@Global::keydefs, { %default_opt, 'field' => 1, 'char' => 1 });
    }

    for my $keydefs (@opt::key) {
        for my $keydef (split /,/, $keydefs) {
            if($keydef =~ /^(\d+)(\.(\d+))?([bdfgiMhnNRrV]+)?$/) {
                # parse keydef F[.C][OPTS]
                my ($field,$char,$flags) = ($1,$3,$4);
                my $opt = { %default_opt,
                            'field' => $field,
                            'char' => $char || 1 };
                # Per-key option letters switch options on for this key
                for my $o (split //, defined $flags ? $flags : "") {
                    $opt->{$ordertbl{$o}} = 1;
                }
                push(@Global::keydefs,$opt);
            } else {
                error("Keydef $keydef does not match F[.C][OPTS]");
                exit(255);
            }
        }
    }
}

sub compare {
    # Compare a line from the file with the search values
    # Input:
    #   $line = line from the sorted file
    #   @search_vals = one key to search for per search column
    # Uses:
    #   $Global::sep = regexp to split $line into fields (if set)
    #   @Global::keydefs = key definitions from parse_keydef()
    # Returns:
    #   result of the first key that differs (see compare_single())
    #   0 if no key differs
    my($line,@search_vals) = @_;
    chomp($line);
    debug("Compare: $line <=> @search_vals ");
    my @field;
    # Test length, not truth, so the separator '0' is honoured
    if(defined $Global::sep and length $Global::sep) {
        # Split line
        @field = split /$Global::sep/o, $line;
    } else {
        @field = ($line);
    }
    my @tmp_vals = @search_vals;
    for my $keydef (@Global::keydefs) {
        # keydef = F[.C][OPTS]: field and char are counted from 1
        my $f = $keydef->{'field'};
        my $c = $keydef->{'char'};
        my $cmp = compare_single(substr($field[$f-1],$c-1),
                                 shift @tmp_vals,$keydef);
        debug("== $cmp\n");
        # They differ on this key
        if($cmp) { return $cmp; }
    }
    # No difference on any keydefs
    return 0;
}

sub compare_single {
    # Compare two values based on the order options of a key
    # Input:
    #   $left = value from the file
    #   $right = value searched for
    #   $opt = key definition (hash ref from parse_keydef())
    # Returns:
    #   <0, 0, >0 if $left sorts before, equal to, after $right
    my ($left,$right,$opt) = @_;
    debug("$left <=> $right");
    # Only build the dump when debugging: my_dump() is expensive
    if($Global::debug) { debug(my_dump($opt),"\n"); }
    if($opt->{'random_sort'}) {
        return rand() <=> rand();
    }
    if($opt->{'ignore_leading_blanks'}) {
        $left =~ s/^\s+//;
        $right =~ s/^\s+//;
    }
    if($opt->{'ignore_case'}) {
        $left = uc($left);
        $right = uc($right);
    }
    if($opt->{'partial_match'}) {
        # String 'foo' matches 'foobar'
        $left = substr($left,0,length $right);
    }
    if($opt->{'reverse'}) {
        ($left,$right) = ($right,$left);
    }
    if($opt->{'human_numeric_sort'}) {
        return multiply_binary_prefix($left) <=> multiply_binary_prefix($right);
    }
    if($opt->{'month_sort'}) {
        if(not %Global::month_no) {
            # JAN => 1 ... DEC => 12. The right side must be a list:
            # {1..12} would be a single hash ref.
            @Global::month_no{qw(JAN FEB MAR APR MAY JUN
                                 JUL AUG SEP OCT NOV DEC)} = (1..12);
        }
        # Unknown month names sort first (as 0)
        return(($Global::month_no{$left} || 0)
               <=>
               ($Global::month_no{$right} || 0));
    }
    if($opt->{'numeric_sort'}) {
        return $left <=> $right;
    } elsif($opt->{'numascii'}) {
        # '||' and not 'or': 'return A or B' never evaluates B
        return(($left <=> $right) || ($left cmp $right));
    } else {
        return $left cmp $right;
    }
}

sub multiply_binary_prefix(@) {
    # Evaluate numbers with binary prefix
    # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^60, Zi=2^70, Yi=2^80
    # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^60, zi=2^70, yi=2^80
    # K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^60, Z =2^70, Y =2^80
    # k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24
    # (Xi, xi, X = 2^90 and x = 10^27 are accepted, too)
    # 13G = 13*1024*1024*1024 = 13958643712
    # Input:
    #   @v = strings with prefixes (undef values are left untouched)
    # Returns:
    #   list context: values with prefixes multiplied
    #   scalar context: first value with prefixes multiplied
    my @v = @_;
    # Prefixes in increasing order: the N'th stands for N factors
    my @prefix = qw(k m g t p e z y x);
    for my $val (@v) {
        defined $val or next;
        # 1E3=1000, 1E-3=0.001
        $val =~ s/e([+-]?\d+)/*10**$1/gi;
        # Ki, Mi, ... in any case: factors of 1024
        for my $i (0..$#prefix) {
            my $p = $prefix[$i];
            my $mult = "*1024" x ($i+1);
            $val =~ s/${p}i/$mult/gi;
        }
        # K, M, ... (upper case): factors of 1024
        for my $i (0..$#prefix) {
            my $p = uc $prefix[$i];
            my $mult = "*1024" x ($i+1);
            $val =~ s/$p/$mult/g;
        }
        # k, m, ... (lower case): factors of 1000
        for my $i (0..$#prefix) {
            my $p = $prefix[$i];
            my $mult = "*1000" x ($i+1);
            $val =~ s/$p/$mult/g;
        }
        # SECURITY: this evaluates text that comes from the searched
        # file and from the search strings as Perl code. Do not use
        # -h on untrusted input.
        $val = eval $val;
    }
    return wantarray ? @v : $v[0];
}

sub status {
    # Print each message followed by a newline on the status handle
    # Uses:
    #   $Global::status_fd = handle to print on (default: STDERR)
    # Returns: N/A
    my @w = @_;
    my $fh = $Global::status_fd || *STDERR;
    print {$fh} map { ($_, "\n") } @w;
    $fh->flush();
}

sub status_no_nl {
    # Print the messages as they are on the status handle
    # Uses:
    #   $Global::status_fd = handle to print on (default: STDERR)
    # Returns: N/A
    my @w = @_;
    my $fh = $Global::status_fd || *STDERR;
    print {$fh} @w;
    $fh->flush();
}

sub warning {
    # Print "progname: Warning: message" for each message
    # Returns: N/A
    my @w = @_;
    my $prog = $Global::progname || "2search";
    status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
}

sub error {
    # Print "progname: Error: message" for each message
    # Returns: N/A
    my @w = @_;
    my $prog = $Global::progname || "2search";
    status(map { ($prog.": Error: ". $_); } @w);
}

sub die_bug {
    my $bugid = shift;
    print STDERR
        ("$Global::progname: This should not happen. 
You have found a bug.\n", + "Please submit a bug at https://gitlab.com/ole.tange/tangetools/-/issues\n", + "and include:\n", + "* The version number: $Global::version\n", + "* The bugid: $bugid\n", + "* The command line being run\n", + "* The files being read (put the files on a webserver if they are big)\n", + "\n", + "If you get the error on smaller/fewer files, please include those instead.\n"); + exit(255); +} + +sub version { + # Returns: N/A + print join("\n", + "$Global::progname $Global::version", + "Copyright (C) 2016-2020", + "Ole Tange and Free Software Foundation, Inc.", + "License GPLv3+: GNU GPL version 3 or later ", + "This is free software: you are free to change and redistribute it.", + "$Global::progname comes with no warranty.", + "", + "Web site: https://gitlab.com/ole.tange/tangetools/\n", + ); +} + +sub my_dump(@) { + # Returns: + # ascii expression of object if Data::Dump(er) is installed + # error code otherwise + my @dump_this = (@_); + eval "use Data::Dump qw(dump);"; + if ($@) { + # Data::Dump not installed + eval "use Data::Dumper;"; + if ($@) { + my $err = "Neither Data::Dump nor Data::Dumper is installed\n". 
+ "Not dumping output\n"; + ::status($err); + return $err; + } else { + return Dumper(@dump_this); + } + } else { + # Create a dummy Data::Dump:dump as Hans Schou sometimes has + # it undefined + eval "sub Data::Dump:dump {}"; + eval "use Data::Dump qw(dump);"; + return (Data::Dump::dump(@dump_this)); + } +} + +sub debug(@) { + # Returns: N/A + $Global::debug or return; + print @_; +} diff --git a/2search/regressiontest b/2search/regressiontest new file mode 100755 index 0000000..f3b71a9 --- /dev/null +++ b/2search/regressiontest @@ -0,0 +1,194 @@ +#!/bin/bash + +test_tmp=`tempfile` +export test_tmp + +opt_tester() { + opt="$@" + tmp=$(tempfile) + test_2search() { + xargs echo Search in < $tmp + 2search $opt $tmp 0 2 2.1 100000 + 2search $opt -B $tmp 0 2 2.1 100000 + } + (true) | + sort $opt > $tmp + echo Search in null file + test_2search + + (echo) | + sort $opt > $tmp + echo Search in newline + test_2search + + (echo 1.000000000) | + sort $opt > $tmp + test_2search + + (echo 1.000000000; + echo 2) | + sort $opt > $tmp + test_2search + + (echo 1; + echo 2.000000000) | + sort $opt > $tmp + test_2search + + (echo 1.000000000; + echo 2; + echo 3) | + sort $opt > $tmp + test_2search + + (echo 1; + echo 2.000000000; + echo 3) | + sort $opt > $tmp + test_2search + + (echo 1; + echo 2; + echo 3.000000000) | + sort $opt > $tmp + test_2search + + rm $tmp +} +export -f opt_tester + +test_n() { + tmp=${test_tmp}_n + true > $tmp + echo Search in null file + 2search -n $tmp 0 2 2.1 100000 + 2search -nB $tmp 0 2 2.1 100000 + echo > $tmp + xargs echo Search in newline + 2search -n $tmp 0 2 2.1 100000 + 2search -nB $tmp 0 2 2.1 100000 + echo 1.000000000 > $tmp + xargs echo Search in < $tmp + 2search -n $tmp 0 2 2.1 100000 + 2search -nB $tmp 0 2 2.1 100000 + echo 1.000000000 > $tmp + echo 2 >> $tmp + xargs echo Search in < $tmp + 2search -n $tmp 0 2 2.1 100000 + 2search -nB $tmp 0 2 2.1 100000 + echo 1 > $tmp + echo 2.000000000 >> $tmp + xargs echo Search in < $tmp + 2search -n 
$tmp 0 2 2.1 100000 + 2search -nB $tmp 0 2 2.1 100000 + echo 1.000000000 > $tmp + echo 2 >> $tmp + echo 3 >> $tmp + xargs echo Search in < $tmp + 2search -n $tmp 0 2 2.1 100000 + 2search -nB $tmp 0 2 2.1 100000 + echo 1 > $tmp + echo 2.000000000 >> $tmp + echo 3 >> $tmp + xargs echo Search in < $tmp + 2search -n $tmp 0 2 2.1 100000 + 2search -nB $tmp 0 2 2.1 100000 + echo 1 > $tmp + echo 2 >> $tmp + echo 3.000000000 >> $tmp + xargs echo Search in < $tmp + 2search -n $tmp 0 2 2.1 100000 + 2search -nB $tmp 0 2 2.1 100000 + rm $tmp +} + +test_n_opt() { + opt_tester -n +} + +test_rn_opt() { + opt_tester -rn +} + +test_r_opt() { + opt_tester -rn +} + +test_k32_2n_1n() { + tmp=$(tempfile) + cat >$tmp < $tmp + echo '### 2search --grep' + 2search --grep $tmp 3 + echo '### 2grep' + 2grep $tmp 3 + echo '### ... | 2grep' + echo 3 | 2grep $tmp + rm $tmp +} + + +export -f $(compgen -A function | grep test_) +compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1' > regressiontest.new +diff regressiontest.new regressiontest.out diff --git a/2search/regressiontest.out b/2search/regressiontest.out new file mode 100644 index 0000000..54e1fe4 --- /dev/null +++ b/2search/regressiontest.out @@ -0,0 +1,280 @@ +test_k32_2n_1n 111 chr10 Sample 10 +test_k32_2n_1n 1111 chr10 Sample 10 +test_k32_2n_1n 11111 chr10 Sample 10 +test_k32_2n_1n 111111 chr10 Sample 10 +test_n Search in null file +test_n 0 +test_n 0 +test_n 0 +test_n 0 +test_n Search in newline +test_n +test_n 0 +test_n 1 +test_n 1 +test_n 1 +test_n Search in 1.000000000 +test_n 1.000000000 +test_n 0 +test_n 12 +test_n 12 +test_n 12 +test_n Search in 1.000000000 2 +test_n 1.000000000 +test_n 2 +test_n 0 +test_n 12 +test_n 14 +test_n 14 +test_n Search in 1 2.000000000 +test_n 1 +test_n 2.000000000 +test_n 0 +test_n 2 +test_n 14 +test_n 14 +test_n Search in 1.000000000 2 3 +test_n 1.000000000 +test_n 2 +test_n 3 +test_n 0 +test_n 12 +test_n 14 +test_n 16 +test_n Search in 1 2.000000000 3 +test_n 1 +test_n 
2.000000000 +test_n 3 +test_n 0 +test_n 2 +test_n 14 +test_n 16 +test_n Search in 1 2 3.000000000 +test_n 1 +test_n 2 +test_n 3.000000000 +test_n 0 +test_n 2 +test_n 4 +test_n 16 +test_n_opt Search in null file +test_n_opt Search in +test_n_opt 0 +test_n_opt 0 +test_n_opt 0 +test_n_opt 0 +test_n_opt Search in newline +test_n_opt Search in +test_n_opt +test_n_opt 0 +test_n_opt 1 +test_n_opt 1 +test_n_opt 1 +test_n_opt Search in 1.000000000 +test_n_opt 1.000000000 +test_n_opt 0 +test_n_opt 12 +test_n_opt 12 +test_n_opt 12 +test_n_opt Search in 1.000000000 2 +test_n_opt 1.000000000 +test_n_opt 2 +test_n_opt 0 +test_n_opt 12 +test_n_opt 14 +test_n_opt 14 +test_n_opt Search in 1 2.000000000 +test_n_opt 1 +test_n_opt 2.000000000 +test_n_opt 0 +test_n_opt 2 +test_n_opt 14 +test_n_opt 14 +test_n_opt Search in 1.000000000 2 3 +test_n_opt 1.000000000 +test_n_opt 2 +test_n_opt 3 +test_n_opt 0 +test_n_opt 12 +test_n_opt 14 +test_n_opt 16 +test_n_opt Search in 1 2.000000000 3 +test_n_opt 1 +test_n_opt 2.000000000 +test_n_opt 3 +test_n_opt 0 +test_n_opt 2 +test_n_opt 14 +test_n_opt 16 +test_n_opt Search in 1 2 3.000000000 +test_n_opt 1 +test_n_opt 2 +test_n_opt 3.000000000 +test_n_opt 0 +test_n_opt 2 +test_n_opt 4 +test_n_opt 16 +test_partial_line ### 2search --grep +test_partial_line 3 +test_partial_line 30 +test_partial_line 31 +test_partial_line 32 +test_partial_line 33 +test_partial_line 34 +test_partial_line 35 +test_partial_line 36 +test_partial_line 37 +test_partial_line 38 +test_partial_line 39 +test_partial_line ### 2grep +test_partial_line 3 +test_partial_line 30 +test_partial_line 31 +test_partial_line 32 +test_partial_line 33 +test_partial_line 34 +test_partial_line 35 +test_partial_line 36 +test_partial_line 37 +test_partial_line 38 +test_partial_line 39 +test_partial_line ### ... 
| 2grep +test_partial_line 3 +test_partial_line 30 +test_partial_line 31 +test_partial_line 32 +test_partial_line 33 +test_partial_line 34 +test_partial_line 35 +test_partial_line 36 +test_partial_line 37 +test_partial_line 38 +test_partial_line 39 +test_rn_opt Search in null file +test_rn_opt Search in +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt Search in newline +test_rn_opt Search in +test_rn_opt +test_rn_opt +test_rn_opt +test_rn_opt +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt Search in 1.000000000 +test_rn_opt 1.000000000 +test_rn_opt 1.000000000 +test_rn_opt 1.000000000 +test_rn_opt 12 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt Search in 2 1.000000000 +test_rn_opt 2 +test_rn_opt 2 +test_rn_opt 2 +test_rn_opt 14 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt Search in 2.000000000 1 +test_rn_opt 2.000000000 +test_rn_opt 2.000000000 +test_rn_opt 2.000000000 +test_rn_opt 14 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt 0 +test_rn_opt Search in 3 2 1.000000000 +test_rn_opt 2 +test_rn_opt 2 +test_rn_opt 3 +test_rn_opt 16 +test_rn_opt 2 +test_rn_opt 2 +test_rn_opt 0 +test_rn_opt Search in 3 2.000000000 1 +test_rn_opt 2.000000000 +test_rn_opt 2.000000000 +test_rn_opt 3 +test_rn_opt 16 +test_rn_opt 2 +test_rn_opt 2 +test_rn_opt 0 +test_rn_opt Search in 3.000000000 2 1 +test_rn_opt 2 +test_rn_opt 2 +test_rn_opt 3.000000000 +test_rn_opt 16 +test_rn_opt 12 +test_rn_opt 12 +test_rn_opt 0 +test_r_opt Search in null file +test_r_opt Search in +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt Search in newline +test_r_opt Search in +test_r_opt +test_r_opt +test_r_opt +test_r_opt +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt Search in 1.000000000 +test_r_opt 1.000000000 +test_r_opt 1.000000000 +test_r_opt 1.000000000 +test_r_opt 12 +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt Search in 2 1.000000000 +test_r_opt 2 +test_r_opt 2 +test_r_opt 2 
+test_r_opt 14 +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt Search in 2.000000000 1 +test_r_opt 2.000000000 +test_r_opt 2.000000000 +test_r_opt 2.000000000 +test_r_opt 14 +test_r_opt 0 +test_r_opt 0 +test_r_opt 0 +test_r_opt Search in 3 2 1.000000000 +test_r_opt 2 +test_r_opt 2 +test_r_opt 3 +test_r_opt 16 +test_r_opt 2 +test_r_opt 2 +test_r_opt 0 +test_r_opt Search in 3 2.000000000 1 +test_r_opt 2.000000000 +test_r_opt 2.000000000 +test_r_opt 3 +test_r_opt 16 +test_r_opt 2 +test_r_opt 2 +test_r_opt 0 +test_r_opt Search in 3.000000000 2 1 +test_r_opt 2 +test_r_opt 2 +test_r_opt 3.000000000 +test_r_opt 16 +test_r_opt 12 +test_r_opt 12 +test_r_opt 0 diff --git a/Makefile b/Makefile index d21aea5..e204c30 100644 --- a/Makefile +++ b/Makefile @@ -1,21 +1,22 @@ -CMD = blink bsearch burncpu duplicate-packets em encdir field forever \ +CMD = blink 2grep 2search burncpu duplicate-packets em encdir field forever \ fxkill G gitnext gitundo goodpasswd histogram mtrr mirrorpdf \ neno off pdfman pidcmd plotpipe puniq ramusage rand rclean \ rina rn rrm seekmaniac shython sound-reload splitvideo stdout \ swapout T timestamp tracefile transpose upsidedown vid \ w4it-for-port-open whitehash wifi-reload wssh ytv yyyymmdd -all: blink/blink.1 bsearch/bsearch.1 burncpu/burncpu.1 \ - encdir/encdir.1 G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 \ - goodpasswd/goodpasswd.1 histogram/histogram.1 \ - mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 \ - pidcmd/pidcmd.1 plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 \ - rina/rina.1 rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 \ - shython/shython.1 sound-reload/sound-reload.1 \ - splitvideo/splitvideo.1 stdout/stdout.1 timestamp/timestamp.1 \ - tracefile/tracefile.1 transpose/transpose.1 T/T.1 \ - upsidedown/upsidedown.1 vid/vid.1 wifi-reload/wifi-reload.1 \ - wssh/wssh.1 ytv/ytv.1 yyyymmdd/yyyymmdd.1 +all: blink/blink.1 2search/2grep.1 2search/2search.1 \ + burncpu/burncpu.1 encdir/encdir.1 G/G.1 gitnext/gitnext.1 \ + 
gitundo/gitundo.1 goodpasswd/goodpasswd.1 \ + histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \ + off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 plotpipe/plotpipe.1 \ + puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \ + seekmaniac/seekmaniac.1 shython/shython.1 \ + sound-reload/sound-reload.1 splitvideo/splitvideo.1 \ + stdout/stdout.1 timestamp/timestamp.1 tracefile/tracefile.1 \ + transpose/transpose.1 T/T.1 upsidedown/upsidedown.1 vid/vid.1 \ + wifi-reload/wifi-reload.1 wssh/wssh.1 ytv/ytv.1 \ + yyyymmdd/yyyymmdd.1 %.1: % pod2man $< > $@ diff --git a/bsearch/bsearch b/bsearch/bsearch deleted file mode 100755 index b7aff04..0000000 --- a/bsearch/bsearch +++ /dev/null @@ -1,404 +0,0 @@ -#!/usr/bin/perl - -=head1 NAME - -bsearch - binary search through sorted text files - -=head1 SYNOPSIS - -B [-nrfB] file string [string...] - -=head1 DESCRIPTION - -B searches a sorted file for a string. It outputs the -following line or the byte position of this line, which is where the -string would have been if it had been in the sorted file. - -=over 9 - -=item B<--ignore-leading-blanks> (not implemented) - -=item B<-b> - -ignore leading blanks - -=item B<--byte-offset> - -=item B<-B> - -print byte position where string would have been - -=item B<--dictionary-order> (not implemented) - -=item B<-d> - -consider only blanks and alphanumeric characters - -=item B<--debug> (not implemented) - -=item B<-D> - -annotate the part of the line used to sort, and warn about -questionable usage to stderr - -=item B<--ignore-case> - -=item B<-f> - -fold lower case to upper case characters - -=item B<--general-numeric-sort> (not implemented) - -=item B<-g> - -compare according to general numerical value - -=item B<--ignore-nonprinting> (not implemented) - -=item B<-i> - -consider only printable characters - -=item B<--month-sort> (not implemented) - -=item B<-M> - -compare (unknown) < 'JAN' < ... 
< 'DEC' - -=item B<--human-numeric-sort> (not implemented) - -=item B<-h> - -compare human readable numbers (e.g., 2K 1G) - -=item B<--key=KEYDEF> (not implemented) - -=item B<-k> - -sort via a key; KEYDEF gives location and type - -=item B<--numeric-sort> - -=item B<-n> - -compare according to string numerical value - -=item B<--random-sort> - -=item B<-R> - -sort by random hash of keys - -=item B<--reverse> - -=item B<-r> - -reverse the result of comparisons - -=item B<--sort=WORD> (not implemented) - -sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month -B<-M>, numeric B<-n>, random B<-R>, version B<-V> - -=item B<-t> (not implemented) - -=item B<--field-separator=SEP> - -use SEP instead of non-blank to blank transition - -=item B<-z> - -=item B<--zero-terminated> - -end lines with 0 byte, not newline - -=back - -=head1 EXAMPLES - -=head2 Missing - -Missing - - -=head1 REPORTING BUGS - -B is part of tangetools. Report bugs to . - - -=head1 AUTHOR - -Copyright (C) 2016 Ole Tange http://ole.tange.dk - - -=head1 LICENSE - -Copyright (C) 2013 Free Software Foundation, Inc. - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3 of the License, or -at your option any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . 
- -=head2 Documentation license I - -Permission is granted to copy, distribute and/or modify this documentation -under the terms of the GNU Free Documentation License, Version 1.3 or -any later version published by the Free Software Foundation; with no -Invariant Sections, with no Front-Cover Texts, and with no Back-Cover -Texts. A copy of the license is included in the file fdl.txt. - -=head2 Documentation license II - -You are free: - -=over 9 - -=item B - -to copy, distribute and transmit the work - -=item B - -to adapt the work - -=back - -Under the following conditions: - -=over 9 - -=item B - -You must attribute the work in the manner specified by the author or -licensor (but not in any way that suggests that they endorse you or -your use of the work). - -=item B - -If you alter, transform, or build upon this work, you may distribute -the resulting work only under the same, similar or a compatible -license. - -=back - -With the understanding that: - -=over 9 - -=item B - -Any of the above conditions can be waived if you get permission from -the copyright holder. - -=item B - -Where the work or any of its elements is in the public domain under -applicable law, that status is in no way affected by the license. - -=item B - -In no way are any of the following rights affected by the license: - -=over 9 - -=item * - -Your fair dealing or fair use rights, or other applicable -copyright exceptions and limitations; - -=item * - -The author's moral rights; - -=item * - -Rights other persons may have either in the work itself or in -how the work is used, such as publicity or privacy rights. - -=back - -=item B - -For any reuse or distribution, you must make clear to others the -license terms of this work. - -=back - -A copy of the full license is included in the file as cc-by-sa.txt. - -=head1 DEPENDENCIES - -B uses Perl. - - -=head1 SEE ALSO - -B(1), B(1). 
- -=cut - -use Getopt::Long; - -Getopt::Long::Configure("bundling","require_order"); - -GetOptions( - "debug|D=s" => \$opt::D, - "version" => \$opt::version, - "verbose|v" => \$opt::verbose, - "B|byte-offset" => \$opt::byte_offset, - "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks, - "d|dictionary-order" => \$opt::dictionary_order, - "f|ignore-case" => \$opt::ignore_case, - "g|general-numeric-sort" => \$opt::general_numeric_sort, - "i|ignore-nonprinting" => \$opt::ignore_nonprinting, - "M|month-sort" => \$opt::month_sort, - "h|human-numeric-sort" => \$opt::human_numeric_sort, - "n|numeric-sort" => \$opt::numeric_sort, - "r|reverse" => \$opt::reverse, - "R|random-sort" => \$opt::random_sort, - "sort=s" => \$opt::sort, - "V|version-sort" => \$opt::version_sort, - "k|key=s" => \@opt::key, - "t|field-separator=s" => \$opt::field_separator, - "z|zero-terminated" => \$opt::zero_terminated, - ); -$Global::progname = "bsearch"; -$Global::version = 20160712; -if($opt::version) { - version(); - exit 0; -} -if($opt::zero_terminated) { $/ = "\0"; } - -my $file = shift; - -for my $key (@ARGV) { - print bsearch($file,$key); -} - -sub bsearch { - my $file = shift; - my $key = shift; - my $min = 0; - my $max = -s $file; - - if(not open ($fh, "<", $file)) { - error("Cannot open '$file'"); - exit 1; - } - my $line; - while($max - $min > 1) { - $middle = int(($max + $min)/2); - seek($fh,$middle,0) or die; - my $half = <$fh>; - if(eof($fh) - or - compare(($line = <$fh>),$key) >= 0) { - $max = $middle; - } else { - $min = $middle; - } - } - seek($fh,$max,0) or die; - $line = <$fh>; - if(compare($line,$key) >= 0) { - if($opt::byte_offset) { - return "0\n"; - } else { - # The very first line - return ""; - } - } else { - if($opt::byte_offset) { - return tell($fh)."\n"; - } else { - return $line; - } - } -} - -sub compare { - my ($a,$b) = @_; - if($opt::random_sort) { - return rand() <=> rand(); - } - if($opt::reverse) { - ($a,$b) = ($b,$a); - } - if($opt::ignore_case) { - $a = 
uc($a); - $b = uc($b); - } - if($opt::numeric_sort) { - return $a <=> $b; - } elsif($opt::numascii) { - return $a <=> $b or $a cmp $b; - } else { - return $a cmp $b; - } -} - -sub status { - my @w = @_; - my $fh = $Global::status_fd || *STDERR; - print $fh map { ($_, "\n") } @w; - flush $fh; -} - -sub status_no_nl { - my @w = @_; - my $fh = $Global::status_fd || *STDERR; - print $fh @w; - flush $fh; -} - -sub warning { - my @w = @_; - my $prog = $Global::progname || "parallel"; - status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w); -} - -sub error { - my @w = @_; - my $prog = $Global::progname || "parallel"; - status(map { ($prog.": Error: ". $_); } @w); -} - -sub die_bug { - my $bugid = shift; - print STDERR - ("$Global::progname: This should not happen. You have found a bug.\n", - "Please contact and include:\n", - "* The version number: $Global::version\n", - "* The bugid: $bugid\n", - "* The command line being run\n", - "* The files being read (put the files on a webserver if they are big)\n", - "\n", - "If you get the error on smaller/fewer files, please include those instead.\n"); - ::wait_and_exit(255); -} - -sub version { - # Returns: N/A - print join("\n", - "GNU $Global::progname $Global::version", - "Copyright (C) 2016", - "Ole Tange and Free Software Foundation, Inc.", - "License GPLv3+: GNU GPL version 3 or later ", - "This is free software: you are free to change and redistribute it.", - "GNU $Global::progname comes with no warranty.", - "", - "Web site: http://www.gnu.org/software/${Global::progname}\n", - "When using programs that use GNU Parallel to process data for publication", - "please cite as described in 'parallel --citation'.\n", - ); -} diff --git a/bsearch/regressiontest b/bsearch/regressiontest deleted file mode 100755 index 6d77046..0000000 --- a/bsearch/regressiontest +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -test_tmp=`tempfile` -export test_tmp - -test_n() { - tmp=${test_tmp}_n - true > $tmp - bsearch -n $tmp 0 2 2.1 100000 
- echo > $tmp - xargs < $tmp - bsearch -n $tmp 0 2 2.1 100000 - echo 1.000000000 > $tmp - xargs < $tmp - bsearch -n $tmp 0 2 2.1 100000 - echo 1.000000000 > $tmp - echo 2 >> $tmp - xargs < $tmp - bsearch -n $tmp 0 2 2.1 100000 - echo 1 > $tmp - echo 2.000000000 >> $tmp - xargs < $tmp - bsearch -n $tmp 0 2 2.1 100000 - echo 1.000000000 > $tmp - echo 2 >> $tmp - echo 3 >> $tmp - xargs < $tmp - bsearch -n $tmp 0 2 2.1 100000 - echo 1 > $tmp - echo 2.000000000 >> $tmp - echo 3 >> $tmp - xargs < $tmp - bsearch -n $tmp 0 2 2.1 100000 - echo 1 > $tmp - echo 2 >> $tmp - echo 3.000000000 >> $tmp - xargs < $tmp - bsearch -n $tmp 0 2 2.1 100000 - rm $tmp -} - - -export -f $(compgen -A function | grep test_) -compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1'