945 lines
23 KiB
Perl
Executable file
945 lines
23 KiB
Perl
Executable file
#!/usr/bin/perl
|
|
|
|
=head1 NAME
|
|
|
|
2search - binary search through sorted text files
|
|
|
|
2grep - binary search+grep through sorted text files
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
B<2search> [-nrfHB] inputfile string [string...]
|
|
|
|
B<2search> --grep [-nrfH] inputfile string [string...]
|
|
|
|
B<2grep> [-nrfH] inputfile string [string...]
|
|
|
|
... | B<2search> [-nrfHB] inputfile
|
|
|
|
... | B<2search> --grep [-nrfH] inputfile
|
|
|
|
... | B<2grep> [-nrfH] inputfile
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
B<2search> searches a sorted file for lines starting with a string. It
|
|
outputs the following line or the byte position of this line, which is
|
|
where the string would have been if it had been in the sorted file.
|
|
|
|
B<2grep> output all lines starting with a given string. The file must
|
|
be sorted.
|
|
|
|
By using B<-k> the same way as in B<sort> you can instead search for
|
|
strings in columns, if the file is sorted using the B<-k> syntax.
|
|
|
|
|
|
=over 9
|
|
|
|
=item B<--ignore-leading-blanks>
|
|
|
|
=item B<-b>
|
|
|
|
Ignore leading blanks. Used if I<inputfile> is sorted with B<sort
|
|
--ignore-leading-blanks>.
|
|
|
|
|
|
=item B<--byte-offset>
|
|
|
|
=item B<-B>
|
|
|
|
Print byte position where string would have been.
|
|
|
|
|
|
=item B<--dictionary-order> (not implemented)
|
|
|
|
=item B<-d>
|
|
|
|
Consider only blanks and alphanumeric characters. Used if I<inputfile>
|
|
is sorted with B<sort --dictionary-order>.
|
|
|
|
|
|
=item B<--debug>
|
|
|
|
=item B<-D>
|
|
|
|
Show debugging information.
|
|
|
|
|
|
=item B<--ignore-case>
|
|
|
|
=item B<-f>
|
|
|
|
Fold lower case to upper case characters. Used if I<inputfile> is
|
|
sorted with B<sort --ignore-case>.
|
|
|
|
|
|
=item B<-F>
|
|
|
|
Fixed string. The search string 'foo' will not match 'foobar'.
|
|
|
|
|
|
=item B<--file> I<file>
|
|
|
|
Search for all lines in I<file>.
|
|
|
|
|
|
=item B<--general-numeric-sort> (not implemented)
|
|
|
|
=item B<-g>
|
|
|
|
Compare according to general numerical value. Used if I<inputfile> is
|
|
sorted with B<sort --general-numeric-sort>.
|
|
|
|
|
|
=item B<--header>
|
|
|
|
=item B<-H>
|
|
|
|
Treat the first line in I<file> as a header. Ignore it when searching
|
|
and print it once in the output.
|
|
|
|
|
|
=item B<--ignore-nonprinting> (not implemented)
|
|
|
|
=item B<-i>
|
|
|
|
Consider only printable characters. Used if I<inputfile> is
|
|
sorted with B<sort --ignore-nonprinting>.
|
|
|
|
|
|
=item B<--month-sort>
|
|
|
|
=item B<-M>
|
|
|
|
Compare (unknown) < 'JAN' < ... < 'DEC'. Used if I<inputfile> is
|
|
sorted with B<sort --month-sort>.
|
|
|
|
|
|
=item B<--human-numeric-sort>
|
|
|
|
=item B<-h>
|
|
|
|
Compare human readable numbers (e.g., 2K 1G). Used if I<inputfile> is
|
|
sorted with B<sort --human-numeric-sort>.
|
|
|
|
|
|
=item B<--key=KEYDEF>
|
|
|
|
=item B<-k>
|
|
|
|
Sort via a key; KEYDEF gives location and type. Used if I<inputfile>
|
|
is sorted with B<sort --key=KEYDEF>.
|
|
|
|
|
|
=item B<--numeric-sort>
|
|
|
|
=item B<-n>
|
|
|
|
Compare according to string numerical value. If numerical values are
|
|
the same: compare as strings. Used if I<inputfile> is sorted with
|
|
B<sort --numeric-sort>.
|
|
|
|
|
|
=item B<--numascii>
|
|
|
|
=item B<-N>
|
|
|
|
split the string into blocks of numbers and non-numbers. For each
|
|
block compare the block as numbers, if the numerical values are the
|
|
same: compare the block as strings.
|
|
|
|
This will sort like this: 3chr 11chr chr3 chr11
|
|
|
|
This is simiar to B<--version-sort>, but without the exceptions.
|
|
|
|
|
|
=item B<--random-sort>
|
|
|
|
=item B<-R>
|
|
|
|
Sort by random hash of keys.
|
|
|
|
|
|
=item B<--reverse>
|
|
|
|
=item B<-r>
|
|
|
|
Reverse the result of comparisons. Used if I<inputfile> is sorted with
|
|
B<sort --reverse>.
|
|
|
|
|
|
=item B<--sort=WORD> (not implemented)
|
|
|
|
Sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
|
|
B<-M>, numeric B<-n>, random B<-R>, version B<-V>
|
|
|
|
|
|
=item B<-t>
|
|
|
|
=item B<--field-separator=SEP>
|
|
|
|
Use I<SEP> instead of blanks (\s+). I<SEP> is a regexp.
|
|
|
|
|
|
=item B<-z>
|
|
|
|
=item B<--zero-terminated>
|
|
|
|
End lines with 0 (NUL) byte, not newline.
|
|
|
|
|
|
=back
|
|
|
|
=head1 EXAMPLES
|
|
|
|
=head2 Single key
|
|
|
|
Given sorted I<input.txt> like:
|
|
|
|
A_number B_number Date Duration CellID
|
|
12893827 21034191 2020-03-21T13:38:13 P00:00:20 CPH382
|
|
12893827 80012345 2020-03-20T12:34:23 P00:00:20 CPH382
|
|
12893827 80012345 2020-03-20T12:45:03 P00:05:20 CPH382
|
|
22355591 47827750 2020-03-20T11:28:33 P00:32:27 ALB923
|
|
22355591 81382631 2020-03-21T21:28:33 P00:12:48 CPH382
|
|
22356142 45701514 2020-03-20T22:41:23 P00:02:48 CPH022
|
|
22356142 56818446 2020-03-21T08:38:34 P00:31:24 CPH645
|
|
|
|
To get all records with 22355591 you can run:
|
|
|
|
grep ^22355591 input.txt
|
|
|
|
But if I<input.txt> is several TB big, it can be very slow. B<2grep>
|
|
uses binary search which only works if the file is sorted, but takes
|
|
less than 1 second to run:
|
|
|
|
2grep -H input.txt 22355591
|
|
|
|
You can also search for a shorter string to get all records starting
|
|
with 2235:
|
|
|
|
2grep -H input.txt 2235
|
|
|
|
Or you can search for multiple search strings:
|
|
|
|
2grep -H input.txt 12893827 22356142
|
|
|
|
=head2 Multiple keys
|
|
|
|
Input is sorted by SampleID, Chromosome, Position (in that order):
|
|
|
|
SampleID Chromosome Position Data
|
|
PatientA chr3 10002123 CCGTCTAATGGCTTGATTGGTACACCATGACATTGA
|
|
PatientA chr3 10003125 TCCATCGTCGGCGAGAAGGTACCAGGTAA
|
|
PatientA chr11 9999998 AATTCACAGTATGGCTGACGGTGTCGTAGCTACACG
|
|
PatientA chr11 10001240 TCCAGAAGTTTGA
|
|
PatientA chr11 10001260 ATAACGAGAACTTACGTTTTAAAAGGCCTA
|
|
PatientB chr3 10000125 GTCTTCACTTTATAAATGGATGATAGCCTTCA
|
|
|
|
SampleID is sorted as text. Chromosome is sorted by text first and
|
|
numerically for the number. Position is sorted by number.
|
|
|
|
To find all chr3 for PatientA:
|
|
|
|
2grep -H -k1,2N inputfile PatientA chr3
|
|
|
|
-N will split 'chr3' into 'chr' which is compared asciibetically and
|
|
'3' which is compared numerically.
|
|
|
|
To find all chr3 for PatientA and all chr3 for PatientB:
|
|
|
|
2grep -H -k1,2N inputfile PatientA chr3 PatientB chr3
|
|
|
|
|
|
=head1 PERFORMANCE
|
|
|
|
Binary search requires seeks from the disk. But B<2search> is designed
|
|
so that multiple searches will reuse cached data. This means searches
|
|
will be faster the more you run.
|
|
|
|
You can improve the speed even more by sorting the input strings. This
|
|
will make it possible to reuse cached data more.
|
|
|
|
It can be even faster if you run multiple searches in parallel.
|
|
|
|
This is due to magnetic drives' elevator sorting of requests when
|
|
seeking and due to NVMe drives working faster with more queues in
|
|
parallel.
|
|
|
|
cat searchstrings | parallel -n50 -j10 2grep inputfile
|
|
|
|
|
|
=head1 BUGS
|
|
|
|
B<2search> does not respect your locale setting. It assumes the input
|
|
is sorted with LC_ALL=C. If it is not B<2search> may give the wrong
|
|
result.
|
|
|
|
To solve this sort the input with B<LC_ALL=C sort ...>.
|
|
|
|
|
|
=head1 REPORTING BUGS
|
|
|
|
B<2search> and B<2grep> are part of tangetools. Report bugs on
|
|
https://git.data.coop/tange/tangetools/issues
|
|
|
|
|
|
=head1 AUTHOR
|
|
|
|
Copyright (C) 2016-2020 Ole Tange http://ole.tange.dk
|
|
|
|
|
|
=head1 LICENSE
|
|
|
|
Copyright (C) 2013 Free Software Foundation, Inc.
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 3 of the License, or
|
|
at your option any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
=head2 Documentation license I
|
|
|
|
Permission is granted to copy, distribute and/or modify this documentation
|
|
under the terms of the GNU Free Documentation License, Version 1.3 or
|
|
any later version published by the Free Software Foundation; with no
|
|
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
|
|
Texts. A copy of the license is included in the file fdl.txt.
|
|
|
|
=head2 Documentation license II
|
|
|
|
You are free:
|
|
|
|
=over 9
|
|
|
|
=item B<to Share>
|
|
|
|
to copy, distribute and transmit the work
|
|
|
|
=item B<to Remix>
|
|
|
|
to adapt the work
|
|
|
|
=back
|
|
|
|
Under the following conditions:
|
|
|
|
=over 9
|
|
|
|
=item B<Attribution>
|
|
|
|
You must attribute the work in the manner specified by the author or
|
|
licensor (but not in any way that suggests that they endorse you or
|
|
your use of the work).
|
|
|
|
=item B<Share Alike>
|
|
|
|
If you alter, transform, or build upon this work, you may distribute
|
|
the resulting work only under the same, similar or a compatible
|
|
license.
|
|
|
|
=back
|
|
|
|
With the understanding that:
|
|
|
|
=over 9
|
|
|
|
=item B<Waiver>
|
|
|
|
Any of the above conditions can be waived if you get permission from
|
|
the copyright holder.
|
|
|
|
=item B<Public Domain>
|
|
|
|
Where the work or any of its elements is in the public domain under
|
|
applicable law, that status is in no way affected by the license.
|
|
|
|
=item B<Other Rights>
|
|
|
|
In no way are any of the following rights affected by the license:
|
|
|
|
=over 9
|
|
|
|
=item *
|
|
|
|
Your fair dealing or fair use rights, or other applicable
|
|
copyright exceptions and limitations;
|
|
|
|
=item *
|
|
|
|
The author's moral rights;
|
|
|
|
=item *
|
|
|
|
Rights other persons may have either in the work itself or in
|
|
how the work is used, such as publicity or privacy rights.
|
|
|
|
=back
|
|
|
|
=item B<Notice>
|
|
|
|
For any reuse or distribution, you must make clear to others the
|
|
license terms of this work.
|
|
|
|
=back
|
|
|
|
A copy of the full license is included in the file as cc-by-sa.txt.
|
|
|
|
=head1 DEPENDENCIES
|
|
|
|
B<2search>/B<2grep> uses Perl.
|
|
|
|
|
|
=head1 SEE ALSO
|
|
|
|
B<grep>(1), B<sort>(1).
|
|
|
|
=cut
|
|
|
|
use strict;
|
|
use Getopt::Long;
|
|
|
|
Getopt::Long::Configure("bundling","require_order");
|
|
|
|
GetOptions(
|
|
"debug|D" => \$opt::D,
|
|
"version" => \$opt::version,
|
|
"verbose|v" => \$opt::verbose,
|
|
"B|byte-offset" => \$opt::byte_offset,
|
|
"b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
|
|
"d|dictionary-order" => \$opt::dictionary_order,
|
|
"f|ignore-case" => \$opt::ignore_case,
|
|
"g|general-numeric-sort" => \$opt::general_numeric_sort,
|
|
"G|grep" => \$opt::grep,
|
|
"F|fixed-strings" => \$opt::fixed_strings,
|
|
"file=s" => \$opt::file,
|
|
"i|ignore-nonprinting" => \$opt::ignore_nonprinting,
|
|
"M|month-sort" => \$opt::month_sort,
|
|
"h|human-numeric-sort" => \$opt::human_numeric_sort,
|
|
"n|numeric-sort" => \$opt::numeric_sort,
|
|
"N|numascii" => \$opt::numascii,
|
|
"r|reverse" => \$opt::reverse,
|
|
"R|random-sort" => \$opt::random_sort,
|
|
"sort=s" => \$opt::sort,
|
|
"V|version-sort" => \$opt::version_sort,
|
|
"k|key=s" => \@opt::key,
|
|
"H|header" => \$opt::header,
|
|
"t|field-separator=s" => \$opt::field_separator,
|
|
"recend|record-end=s" => \$opt::record_end,
|
|
"recstart|record-start=s" => \$opt::record_start,
|
|
"z|zero-terminated" => \$opt::zero_terminated,
|
|
) || exit(255);
|
|
$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
|
|
$Global::version = 20200328;
|
|
if($opt::version) { version(); exit 0; }
|
|
if($opt::zero_terminated) { $/ = "\0"; }
|
|
if(@opt::key) {
|
|
# Default separator if --key = whitespace
|
|
$Global::fieldsep = '\s+';
|
|
if(defined $opt::field_separator) { $Global::fieldsep = $opt::field_separator; }
|
|
}
|
|
if($Global::progname eq "2grep") { $opt::grep = 1; }
|
|
$Global::debug = $opt::D;
|
|
if(defined $opt::record_end or defined $opt::record_start) {
|
|
if(not defined $opt::record_end) { $opt::record_end = ""; }
|
|
if(not defined $opt::record_start) { $opt::record_start = ""; }
|
|
$/ = unquote_printf($opt::record_end).unquote_printf($opt::record_start);
|
|
} else {
|
|
# Default = \n
|
|
$opt::record_end = "\n";
|
|
$/ = $opt::record_end;
|
|
}
|
|
|
|
parse_keydef();
|
|
|
|
debug(my_dump(\@Global::keydefs),"\n");
|
|
|
|
my $file = shift;
|
|
if(@ARGV) {
|
|
$opt::argv = 1;
|
|
} elsif(defined $opt::file) {
|
|
# skip
|
|
} else {
|
|
$opt::stdin = 1;
|
|
}
|
|
|
|
$Global::headersize = 0;
|
|
if($opt::header) {
|
|
if(not open (my $fh, "<", $file)) {
|
|
error("Cannot open '$file'");
|
|
exit 1;
|
|
} else {
|
|
my $header = <$fh>;
|
|
$header =~ s/\Q$opt::record_start\E$//;
|
|
$Global::headersize = length $header;
|
|
print $header;
|
|
}
|
|
}
|
|
|
|
round:
|
|
while(1) {
|
|
my @search_vals;
|
|
for(@Global::keydefs) {
|
|
my $val = get();
|
|
if(not defined $val) {
|
|
last round;
|
|
}
|
|
push @search_vals, $val;
|
|
}
|
|
if($opt::grep) {
|
|
bgrep($file,@search_vals);
|
|
} else {
|
|
print bsearch($file,@search_vals);
|
|
}
|
|
}
|
|
|
|
{
|
|
my $fh;
|
|
|
|
sub get {
|
|
if($opt::argv) {
|
|
# Search for strings on the command line
|
|
return shift @ARGV;
|
|
}
|
|
if($opt::file) {
|
|
# Search for strings given with --file
|
|
if(not $fh) {
|
|
if(not open(my $fh, "<", $opt::file)) {
|
|
error("Cannot open $opt::file");
|
|
exit(255);
|
|
}
|
|
}
|
|
my $val = <$fh>;
|
|
chomp $val;
|
|
return $val;
|
|
}
|
|
if($opt::stdin) {
|
|
# Search for strings on stdin
|
|
my $val = <>;
|
|
chomp $val;
|
|
return $val;
|
|
}
|
|
die;
|
|
}
|
|
}
|
|
|
|
sub bgrep {
|
|
my $file = shift;
|
|
my @search_vals = @_;
|
|
$opt::byte_offset = 1;
|
|
my $startpos = bsearch($file,@search_vals);
|
|
my $fh;
|
|
if(not open ($fh, "<", $file)) {
|
|
error("Cannot open '$file'");
|
|
exit 1;
|
|
}
|
|
seek($fh,$startpos,0) or die;
|
|
if(not $opt::fixed_strings) {
|
|
# Allow for partial matches in grep (4 mathes 40, A matches Aaa)
|
|
for my $keydef (@Global::keydefs) {
|
|
$keydef->{'partial_match'} = 1;
|
|
}
|
|
}
|
|
my $line;
|
|
while($line = <$fh>
|
|
and
|
|
not compare($line,@search_vals)) {
|
|
print $line;
|
|
}
|
|
close $fh;
|
|
for my $keydef (@Global::keydefs) {
|
|
$keydef->{'partial_match'} = 0;
|
|
}
|
|
}
|
|
|
|
sub bsearch {
|
|
my $file = shift;
|
|
my @search_vals = @_;
|
|
my $min = $Global::headersize;
|
|
my $max = -s $file;
|
|
my $fh;
|
|
if(not open ($fh, "<", $file)) {
|
|
error("Cannot open '$file'");
|
|
exit 1;
|
|
}
|
|
my($line,$middle);
|
|
my $minnl = $min;
|
|
my $maxnl = $max;
|
|
while($max - $min > 1) {
|
|
$middle = int(($max + $min)/2);
|
|
seek($fh,$middle,0) or die("Cannot seek to $middle");
|
|
if($middle > 0) {
|
|
# Read last half of a line
|
|
<$fh>;
|
|
}
|
|
my $newline_pos = tell($fh);
|
|
debug("$min <= $middle <= $newline_pos <= $max\n");
|
|
debug("$minnl <= $newline_pos <= $maxnl\n");
|
|
if($newline_pos == $maxnl
|
|
or
|
|
eof($fh)
|
|
or
|
|
compare(($line = <$fh>),@search_vals) >= 0) {
|
|
# We have see this newline position before
|
|
# or we are at the end of the file
|
|
# or we should search the lower half
|
|
$max = $middle;
|
|
$maxnl = $newline_pos;
|
|
} else {
|
|
# We should search the upper half
|
|
$min = $middle;
|
|
$minnl = $newline_pos;
|
|
}
|
|
}
|
|
seek($fh,$minnl,0) or die("Cannot seek to $minnl");
|
|
$line = <$fh>;
|
|
my $len = length $opt::record_start;
|
|
my $retpos;
|
|
if(compare($line,@search_vals) >= 0) {
|
|
# Adjust for length of $recstart
|
|
$retpos = $minnl - $len;
|
|
} else {
|
|
$retpos = tell($fh) - $len;
|
|
}
|
|
$retpos = $retpos < 0 ? 0 : $retpos;
|
|
if($opt::byte_offset) {
|
|
return $retpos."\n";
|
|
} else {
|
|
seek($fh,$retpos,0) or die("Cannot seek to $minnl");
|
|
if(length $opt::record_end) {
|
|
# read record: A...BA
|
|
# Remove $opt::record_start if it is at the end
|
|
# (might not be only record)
|
|
$line = <$fh>;
|
|
$line =~ s/\Q$opt::record_start\E$//;
|
|
} else {
|
|
# --recend == ''
|
|
if(length $opt::record_start) {
|
|
# read record: A...A
|
|
# Remove $opt::record_start if it is at the end
|
|
# (might not be only record)
|
|
$line = <$fh>; # Read: A
|
|
$line .= <$fh>; # Read: ...A
|
|
$line =~ s/\Q$opt::record_start\E$//;
|
|
} else {
|
|
# Len recstart == Len recend = 0. Does this ever happen?
|
|
# read record.
|
|
# Remove $opt::record_start if it is there (might be only record)
|
|
$line = <$fh>;
|
|
$line =~ s/\Q$opt::record_start\E$//;
|
|
}
|
|
}
|
|
return $line;
|
|
}
|
|
}
|
|
|
|
sub parse_keydef {
|
|
# parse keydef F[.C][OPTS][,F[.C][OPTS]]
|
|
my %defaultorder = (
|
|
"b" => $opt::ignore_leading_blanks,
|
|
"d" => $opt::dictionary_order,
|
|
"f" => $opt::ignore_case,
|
|
"g" => $opt::general_numeric_sort,
|
|
"i" => $opt::ignore_nonprinting,
|
|
"M" => $opt::month_sort,
|
|
"h" => $opt::human_numeric_sort,
|
|
"n" => $opt::numeric_sort,
|
|
"N" => $opt::numascii,
|
|
"r" => $opt::reverse,
|
|
"R" => $opt::random_sort,
|
|
"V" => $opt::version_sort,
|
|
);
|
|
my %ordertbl = (
|
|
"b" => 'ignore_leading_blanks',
|
|
"d" => 'dictionary_order',
|
|
"f" => 'ignore_case',
|
|
"g" => 'general_numeric_sort',
|
|
"i" => 'ignore_nonprinting',
|
|
"M" => 'month_sort',
|
|
"h" => 'human_numeric_sort',
|
|
"n" => 'numeric_sort',
|
|
"N" => 'numascii',
|
|
"r" => 'reverse',
|
|
"R" => 'random_sort',
|
|
"V" => 'version_sort',
|
|
);
|
|
|
|
if(@opt::key) {
|
|
# skip
|
|
} else {
|
|
# Convert -n -r to -k1rn
|
|
# with sep = undef
|
|
$Global::fieldsep = undef;
|
|
my $opt;
|
|
$opt->{'field'} = 1;
|
|
$opt->{'char'} = 1;
|
|
for (keys %defaultorder) {
|
|
$opt->{$ordertbl{$_}} = $defaultorder{$_};
|
|
}
|
|
push(@Global::keydefs,$opt);
|
|
}
|
|
|
|
for my $keydefs (@opt::key) {
|
|
for my $keydef (split /,/, $keydefs) {
|
|
my $opt;
|
|
if($keydef =~ /^(\d+)(\.(\d+))?([bdfgiMhnNRrV]+)?$/) {
|
|
# parse keydef F[.C][OPTS][,F[.C][OPTS]]
|
|
$opt->{'field'} = $1;
|
|
$opt->{'char'} = $3 || 1;
|
|
for (keys %defaultorder) {
|
|
$opt->{$ordertbl{$_}} = $defaultorder{$_};
|
|
}
|
|
for my $o (split //, $4) {
|
|
$opt->{$ordertbl{$o}} = 1;
|
|
}
|
|
} else {
|
|
error("Keydef $keydef does not match F[.C][OPTS]");
|
|
exit(255);
|
|
}
|
|
push(@Global::keydefs,$opt);
|
|
}
|
|
}
|
|
}
|
|
|
|
sub compare {
|
|
# One key to search for per search column
|
|
my($line,@search_vals) = @_;
|
|
chomp($line);
|
|
debug("Compare: $line <=> @search_vals; ");
|
|
my @field;
|
|
if($Global::fieldsep) {
|
|
# Split line
|
|
@field = split /$Global::fieldsep/o, $line;
|
|
} else {
|
|
@field = ($line);
|
|
}
|
|
my @tmp_vals = @search_vals;
|
|
for my $keydef (@Global::keydefs) {
|
|
# keydef = F[.C][OPTS][,F[.C][OPTS]]
|
|
my $f = $keydef->{'field'};
|
|
my $c = $keydef->{'char'};
|
|
my $cmp = compare_single(substr($field[$f-1],$c-1),shift @tmp_vals,$keydef);
|
|
# They differ on this key
|
|
debug("== $cmp\n");
|
|
if($cmp) { return $cmp; }
|
|
}
|
|
# No difference on any keydefs
|
|
return 0;
|
|
}
|
|
|
|
sub compare_single {
|
|
# Compare two lines based on order options
|
|
my ($a,$b,$opt) = @_;
|
|
debug("$a <=> $b");
|
|
debug(my_dump($opt),"\n");
|
|
if($opt->{'random_sort'}) {
|
|
return rand() <=> rand();
|
|
}
|
|
if($opt->{'ignore_leading_blanks'}) {
|
|
$a =~ s/^\s+//;
|
|
$b =~ s/^\s+//;
|
|
}
|
|
if($opt->{'ignore_case'}) {
|
|
$a = uc($a);
|
|
$b = uc($b);
|
|
}
|
|
if($opt->{'partial_match'}) {
|
|
# String 'foo' matches 'foobar'
|
|
$a = substr($a,0,length $b);
|
|
}
|
|
if($opt->{'reverse'}) {
|
|
($a,$b) = ($b,$a);
|
|
}
|
|
if($opt->{'human_numeric_sort'}) {
|
|
return multiply_binary_prefix($a) <=> multiply_binary_prefix($b);
|
|
}
|
|
if($opt->{'month_sort'}) {
|
|
my %m;
|
|
my @mon = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
|
|
@m{@mon}={1..12};
|
|
return ($m{$a} || 0) <=> ($m{$b} || 0);
|
|
}
|
|
if($opt->{'numeric_sort'}) {
|
|
return($a <=> $b or $a cmp $b);
|
|
} elsif($opt->{'numascii'}) {
|
|
# Split on digit boundary
|
|
my @a = split /(?<=\d)(?=\D)|(?<=\D)(?=\d)/i, $a;
|
|
my @b = split /(?<=\d)(?=\D)|(?<=\D)(?=\d)/i, $b;
|
|
my $c;
|
|
for(my $t = 0;
|
|
defined $a[$t] and defined $b[$t];
|
|
$t++) {
|
|
$c = ($a[$t] <=> $b[$t] or $a[$t] cmp $b[$t]);
|
|
$c and return $c;
|
|
}
|
|
# All parts match, maybe one is longer
|
|
return $#a <=> $#b;
|
|
} else {
|
|
return $a cmp $b;
|
|
}
|
|
}
|
|
|
|
sub multiply_binary_prefix(@) {
|
|
# Evalualte numbers with binary prefix
|
|
# Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^70, Zi=2^80, Yi=2^80
|
|
# ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^70, zi=2^80, yi=2^80
|
|
# K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^70, Z =2^80, Y =2^80
|
|
# k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24
|
|
# 13G = 13*1024*1024*1024 = 13958643712
|
|
# Input:
|
|
# $s = string with prefixes
|
|
# Returns:
|
|
# $value = int with prefixes multiplied
|
|
my @v = @_;
|
|
for(@v) {
|
|
# 1E3=1000, 1E-3=0.001
|
|
s/e([+-]?\d+)/*10**$1/gi;
|
|
}
|
|
for(@v) {
|
|
defined $_ or next;
|
|
s/ki/*1024/gi;
|
|
s/mi/*1024*1024/gi;
|
|
s/gi/*1024*1024*1024/gi;
|
|
s/ti/*1024*1024*1024*1024/gi;
|
|
s/pi/*1024*1024*1024*1024*1024/gi;
|
|
s/ei/*1024*1024*1024*1024*1024*1024/gi;
|
|
s/zi/*1024*1024*1024*1024*1024*1024*1024/gi;
|
|
s/yi/*1024*1024*1024*1024*1024*1024*1024*1024/gi;
|
|
s/xi/*1024*1024*1024*1024*1024*1024*1024*1024*1024/gi;
|
|
|
|
s/K/*1024/g;
|
|
s/M/*1024*1024/g;
|
|
s/G/*1024*1024*1024/g;
|
|
s/T/*1024*1024*1024*1024/g;
|
|
s/P/*1024*1024*1024*1024*1024/g;
|
|
s/E/*1024*1024*1024*1024*1024*1024/g;
|
|
s/Z/*1024*1024*1024*1024*1024*1024*1024/g;
|
|
s/Y/*1024*1024*1024*1024*1024*1024*1024*1024/g;
|
|
s/X/*1024*1024*1024*1024*1024*1024*1024*1024*1024/g;
|
|
|
|
s/k/*1000/g;
|
|
s/m/*1000*1000/g;
|
|
s/g/*1000*1000*1000/g;
|
|
s/t/*1000*1000*1000*1000/g;
|
|
s/p/*1000*1000*1000*1000*1000/g;
|
|
s/e/*1000*1000*1000*1000*1000*1000/g;
|
|
s/z/*1000*1000*1000*1000*1000*1000*1000/g;
|
|
s/y/*1000*1000*1000*1000*1000*1000*1000*1000/g;
|
|
s/x/*1000*1000*1000*1000*1000*1000*1000*1000*1000/g;
|
|
|
|
$_ = eval $_;
|
|
}
|
|
return wantarray ? @v : $v[0];
|
|
}
|
|
|
|
sub status {
|
|
my @w = @_;
|
|
my $fh = $Global::status_fd || *STDERR;
|
|
print $fh map { ($_, "\n") } @w;
|
|
flush $fh;
|
|
}
|
|
|
|
sub status_no_nl {
|
|
my @w = @_;
|
|
my $fh = $Global::status_fd || *STDERR;
|
|
print $fh @w;
|
|
flush $fh;
|
|
}
|
|
|
|
sub warning {
|
|
my @w = @_;
|
|
my $prog = $Global::progname || "parallel";
|
|
status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
|
|
}
|
|
|
|
sub error {
|
|
my @w = @_;
|
|
my $prog = $Global::progname || "parallel";
|
|
status(map { ($prog.": Error: ". $_); } @w);
|
|
}
|
|
|
|
sub die_bug {
|
|
my $bugid = shift;
|
|
print STDERR
|
|
("$Global::progname: This should not happen. You have found a bug.\n",
|
|
"Please submit a bug at https://git.data.coop/tange/tangetools/issues\n",
|
|
"and include:\n",
|
|
"* The version number: $Global::version\n",
|
|
"* The bugid: $bugid\n",
|
|
"* The command line being run\n",
|
|
"* The files being read (put the files on a webserver if they are big)\n",
|
|
"\n",
|
|
"If you get the error on smaller/fewer files, please include those instead.\n");
|
|
exit(255);
|
|
}
|
|
|
|
sub version {
|
|
# Returns: N/A
|
|
print join("\n",
|
|
"$Global::progname $Global::version",
|
|
"Copyright (C) 2016-2020",
|
|
"Ole Tange and Free Software Foundation, Inc.",
|
|
"License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
|
|
"This is free software: you are free to change and redistribute it.",
|
|
"$Global::progname comes with no warranty.",
|
|
"",
|
|
"Web site: https://git.data.coop/tange/tangetools/\n",
|
|
);
|
|
}
|
|
|
|
sub my_dump(@) {
|
|
# Returns:
|
|
# ascii expression of object if Data::Dump(er) is installed
|
|
# error code otherwise
|
|
my @dump_this = (@_);
|
|
eval "use Data::Dump qw(dump);";
|
|
if ($@) {
|
|
# Data::Dump not installed
|
|
eval "use Data::Dumper;";
|
|
if ($@) {
|
|
my $err = "Neither Data::Dump nor Data::Dumper is installed\n".
|
|
"Not dumping output\n";
|
|
::status($err);
|
|
return $err;
|
|
} else {
|
|
return Dumper(@dump_this);
|
|
}
|
|
} else {
|
|
# Create a dummy Data::Dump:dump as Hans Schou sometimes has
|
|
# it undefined
|
|
eval "sub Data::Dump:dump {}";
|
|
eval "use Data::Dump qw(dump);";
|
|
return (Data::Dump::dump(@dump_this));
|
|
}
|
|
}
|
|
|
|
sub debug(@) {
|
|
# Returns: N/A
|
|
$Global::debug or return;
|
|
print @_;
|
|
}
|
|
|
|
sub unquote_printf() {
|
|
# Convert \t \n \r \000 \0
|
|
# Inputs:
|
|
# $string = string with \t \n \r \num \0
|
|
# Returns:
|
|
# $replaced = string with TAB NEWLINE CR <ascii-num> NUL
|
|
$_ = shift;
|
|
s/\\t/\t/g;
|
|
s/\\n/\n/g;
|
|
s/\\r/\r/g;
|
|
s/\\(\d\d\d)/eval 'sprintf "\\'.$1.'"'/ge;
|
|
s/\\(\d)/eval 'sprintf "\\'.$1.'"'/ge;
|
|
return $_;
|
|
}
|
|
|