Merge branch 'master' of gitlab.com:ole.tange/tangetools

2016-08-13 20:41:59 +02:00 · 2016-08-13 20:41:59 +02:00 · ee13554589
parent 498e333232 1e87eb8f2c
commit ee13554589
8 changed files with 572 additions and 27 deletions
--- a/6
+++ b/6
@ -1,12 +1,12 @@
-CMD = blink histogram upsidedown tracefile timestamp rand rrm goodpasswd gitnext
+CMD = blink bsearch histogram upsidedown tracefile timestamp rand rrm goodpasswd gitnext
-all: blink/blink.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1
+all: blink/blink.1 bsearch/bsearch.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1
 %.1: %
 	pod2man $< > $@
 install:
 	mkdir -p /usr/local/bin
-	parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm gitnext
+	parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink bsearch reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm gitnext
 	mkdir -p /usr/local/share/man/man1
 	parallel ln -sf `pwd`/{} /usr/local/share/man/man1/{/} ::: */*.1
--- a/2
+++ b/2
@ -2,6 +2,8 @@ Tools developed by Ole Tange <ole@tange.dk>.
 Probably not useful for you, but then again you never know.
 bsearch - binary search through sorted text files.
 em - Force emacs to run in terminal. Use xemacs if installed.
 field - Split on space. Give the given field number. Supports syntax 1-3,6-
--- a/bsearch/bsearch
+++ b/bsearch/bsearch
@ -0,0 +1,400 @@
 #!/usr/bin/perl
 =head1 NAME
 bsearch - binary search through sorted text files
 =head1 SYNOPSIS
 B<bsearch> [-nrfB] file string [string...]
 =head1 DESCRIPTION
 B<bsearch> searches a sorted file for a string. It outputs the
 following line or the byte position of this line, which is where the
 string would have been if it had been in the sorted file.
 =over 9
 =item B<--ignore-leading-blanks> (not implemented)
 =item B<-b>
 ignore leading blanks
 =item B<--byte-offset>
 =item B<-B>
 print byte position where string would have been
 =item B<--dictionary-order> (not implemented)
 =item B<-d>
 consider only blanks and alphanumeric characters
 =item B<--debug> (not implemented)
 =item B<-D>
 annotate the part of the line used to sort, and warn about
 questionable usage to stderr
 =item B<--ignore-case>
 =item B<-f>
 fold lower case to upper case characters
 =item B<--general-numeric-sort> (not implemented)
 =item B<-g>
 compare according to general numerical value
 =item B<--ignore-nonprinting> (not implemented)
 =item B<-i>
 consider only printable characters
 =item B<--month-sort> (not implemented)
 =item B<-M>
 compare (unknown) < 'JAN' < ... < 'DEC'
 =item B<--human-numeric-sort> (not implemented)
 =item B<-h>
 compare human readable numbers (e.g., 2K 1G)
 =item B<--key=KEYDEF> (not implemented)
 =item B<-k>
 sort via a key; KEYDEF gives location and type
 =item B<--numeric-sort>
 =item B<-n>
 compare according to string numerical value
 =item B<--random-sort> (not implemented)
 =item B<-R>
 sort by random hash of keys
 =item B<--reverse>
 =item B<-r>
 reverse the result of comparisons
 =item B<--sort=WORD> (not implemented)
 sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
 B<-M>, numeric B<-n>, random B<-R>, version B<-V>
 =item B<-t> (not implemented)
 =item B<--field-separator=SEP>
 use SEP instead of non-blank to blank transition
 =item B<-z> (not implemented)
 =item B<--zero-terminated>
 end lines with 0 byte, not newline
 =back
 =head1 EXAMPLES
 =head2 Missing
 Missing
 =head1 REPORTING BUGS
 B<bsearch> is part of tangetools. Report bugs to <tools@tange.dk>.
 =head1 AUTHOR
 Copyright (C) 2016 Ole Tange http://ole.tange.dk
 =head1 LICENSE
 Copyright (C) 2013 Free Software Foundation, Inc.
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 at your option any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 =head2 Documentation license I
 Permission is granted to copy, distribute and/or modify this documentation
 under the terms of the GNU Free Documentation License, Version 1.3 or
 any later version published by the Free Software Foundation; with no
 Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
 Texts.  A copy of the license is included in the file fdl.txt.
 =head2 Documentation license II
 You are free:
 =over 9
 =item B<to Share>
 to copy, distribute and transmit the work
 =item B<to Remix>
 to adapt the work
 =back
 Under the following conditions:
 =over 9
 =item B<Attribution>
 You must attribute the work in the manner specified by the author or
 licensor (but not in any way that suggests that they endorse you or
 your use of the work).
 =item B<Share Alike>
 If you alter, transform, or build upon this work, you may distribute
 the resulting work only under the same, similar or a compatible
 license.
 =back
 With the understanding that:
 =over 9
 =item B<Waiver>
 Any of the above conditions can be waived if you get permission from
 the copyright holder.
 =item B<Public Domain>
 Where the work or any of its elements is in the public domain under
 applicable law, that status is in no way affected by the license.
 =item B<Other Rights>
 In no way are any of the following rights affected by the license:
 =over 9
 =item *
 Your fair dealing or fair use rights, or other applicable
 copyright exceptions and limitations;
 =item *
 The author's moral rights;
 =item *
 Rights other persons may have either in the work itself or in
 how the work is used, such as publicity or privacy rights.
 =back
 =item B<Notice>
 For any reuse or distribution, you must make clear to others the
 license terms of this work.
 =back
 A copy of the full license is included in the file as cc-by-sa.txt.
 =head1 DEPENDENCIES
 B<bsearch> uses Perl.
 =head1 SEE ALSO
 B<grep>(1), B<sort>(1).
 =cut
 use Getopt::Long;
 # Command line handling for bsearch.
 # "bundling" lets single-letter options be clustered (-nr); "require_order"
 # stops option parsing at the first non-option argument (the file name).
 Getopt::Long::Configure("bundling","require_order");
 if(not GetOptions(
        "debug|D=s" => \$opt::D,
        "version" => \$opt::version,
        "verbose|v" => \$opt::verbose,
        "B|byte-offset" => \$opt::byte_offset,
        "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
        "d|dictionary-order" => \$opt::dictionary_order,
        "f|ignore-case" => \$opt::ignore_case,
        "g|general-numeric-sort" => \$opt::general_numeric_sort,
        "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
        "M|month-sort" => \$opt::month_sort,
        "h|human-numeric-sort" => \$opt::human_numeric_sort,
        "n|numeric-sort" => \$opt::numeric_sort,
        "r|reverse" => \$opt::reverse,
        "sort=s" => \$opt::sort,
        "V|version-sort" => \$opt::version_sort,
        "k|key=s" => \@opt::key,
        "t|field-separator=s" => \$opt::field_separator,
        "z|zero-terminated" => \$opt::zero_terminated,
    )) {
    # GetOptions has already reported the offending option on stderr;
    # do not continue with a half-parsed command line.
    exit 255;
 }
 $Global::progname = "bsearch";
 $Global::version = 20160712;
 if($opt::version) {
    version();
    exit 0;
 }
 # --zero-terminated: records end with a NUL byte instead of newline
 if($opt::zero_terminated) { $/ = "\0"; }
 # First non-option argument is the sorted file; the rest are search keys
 my $file = shift;
 if(not defined $file) {
    error("Usage: $Global::progname [-nrfB] file string [string...]");
    exit 255;
 }
 # Look up every key in the same file, in the order given
 for my $key (@ARGV) {
    print bsearch($file,$key);
 }
 sub bsearch {
    # Binary search a sorted file for a key by bisecting on byte offsets
    # Input:
    #   $file = name of the sorted file to search
    #   $key = string to search for
    # Uses:
    #   $opt::byte_offset
    #   compare() for the ordering
    # Returns:
    #   with --byte-offset: a byte position followed by "\n"
    #   otherwise: a line read from the file, or "" when the line
    #     probed last already compares >= $key
    my $file = shift;
    my $key = shift;
    my $min = 0;
    # -s gives an empty value for a zero length file; normalise to 0
    my $max = (-s $file) || 0;
    # Lexical handle: a package-global handle would leak between calls
    my $fh;
    if(not open($fh, "<", $file)) {
        error("Cannot open '$file': $!");
        exit 1;
    }
    my $line;
    while($max - $min > 1) {
        my $middle = int(($max + $min)/2);
        seek($fh,$middle,0)
            or die "$Global::progname: Cannot seek in '$file': $!\n";
        # $middle is most likely inside a line: read and discard the rest
        # of that line so the comparison is done on a complete line
        my $partial = <$fh>;
        if(eof($fh)
           or
           compare(($line = <$fh>),$key) >= 0) {
            $max = $middle;
        } else {
            $min = $middle;
        }
    }
    seek($fh,$max,0)
        or die "$Global::progname: Cannot seek in '$file': $!\n";
    $line = <$fh>;
    my $result;
    if(compare($line,$key) >= 0) {
        # The very first line
        $result = $opt::byte_offset ? "0\n" : "";
    } else {
        # tell() must be taken before the handle is closed
        $result = $opt::byte_offset ? tell($fh)."\n" : $line;
    }
    close $fh;
    return $result;
 }
 sub compare {
    # Compare two strings according to the sort options in effect
    # Input:
    #   $left, $right = the two values to compare
    # Uses:
    #   $opt::reverse = swap the operands
    #   $opt::ignore_case = compare upper-cased copies
    #   $opt::numeric_sort = compare numerically
    #   $opt::numascii = compare numerically, then as strings on a tie
    # Returns:
    #   -1, 0 or 1 as for <=> / cmp
    # The operands are not called $a/$b: lexical $a/$b shadow the
    # package variables used by sort.
    my ($left,$right) = @_;
    if($opt::reverse) {
        ($left,$right) = ($right,$left);
    }
    if($opt::ignore_case) {
        $left = uc($left);
        $right = uc($right);
    }
    if($opt::numeric_sort) {
        return $left <=> $right;
    } elsif($opt::numascii) {
        # Must be '||': 'return X or Y' parses as '(return X) or Y',
        # which would never evaluate the string tie-break.
        return (($left <=> $right) || ($left cmp $right));
    } else {
        return $left cmp $right;
    }
 }
 sub status {
    # Print every argument followed by a newline on the status filehandle
    # Input:
    #   @_ = messages
    # Uses:
    #   $Global::status_fd (falls back to STDERR when unset)
    my $out = $Global::status_fd || *STDERR;
    my @pieces;
    for my $message (@_) {
        push @pieces, $message, "\n";
    }
    print {$out} @pieces;
    # Push the text out immediately
    $out->flush;
 }
 sub status_no_nl {
    # Print the arguments unchanged (no newline added) on the status
    # filehandle
    # Input:
    #   @_ = strings to print
    # Uses:
    #   $Global::status_fd (falls back to STDERR when unset)
    my @pieces = @_;
    my $out = $Global::status_fd || *STDERR;
    print {$out} @pieces;
    # Push the text out immediately
    $out->flush;
 }
 sub warning {
    # Print each argument as "<program>: Warning: <message>" plus newline
    # Input:
    #   @_ = warning messages
    # Uses:
    #   $Global::progname ("parallel" is used when it is unset)
    my $program = $Global::progname || "parallel";
    my @pieces;
    for my $message (@_) {
        push @pieces, $program, ": Warning: ", $message, "\n";
    }
    status_no_nl(@pieces);
 }
 sub error {
    # Print each argument as "<program>: Error: <message>" on its own line
    # Input:
    #   @_ = error messages
    # Uses:
    #   $Global::progname ("parallel" is used when it is unset)
    my $program = $Global::progname || "parallel";
    my @lines;
    for my $message (@_) {
        push @lines, $program.": Error: ".$message;
    }
    status(@lines);
 }
 sub die_bug {
    # Report an internal inconsistency on stderr and terminate
    # Input:
    #   $bugid = identifier of the internal check that failed
    # Uses:
    #   $Global::progname
    #   $Global::version
    # Returns: N/A (exits with status 255)
    my $bugid = shift;
    print STDERR
        ("$Global::progname: This should not happen. You have found a bug.\n",
         # Same address as the REPORTING BUGS section of the manual
         "Please contact <tools\@tange.dk> and include:\n",
         "* The version number: $Global::version\n",
         "* The bugid: $bugid\n",
         "* The command line being run\n",
         "* The files being read (put the files on a webserver if they are big)\n",
         "\n",
         "If you get the error on smaller/fewer files, please include those instead.\n");
    # Plain exit: wait_and_exit() is a GNU Parallel function and is not
    # defined in this program.
    exit(255);
 }
 sub version {
    # Print the version and license banner on stdout
    # Uses:
    #   $Global::progname
    #   $Global::version
    # Returns: N/A
    my @banner =
        ("GNU $Global::progname $Global::version",
         "Copyright (C) 2016",
         "Ole Tange and Free Software Foundation, Inc.",
         "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
         "This is free software: you are free to change and redistribute it.",
         "GNU $Global::progname comes with no warranty.",
         "",
         # The embedded newline gives a blank line after the web site
         "Web site: http://www.gnu.org/software/${Global::progname}\n",
         "When using programs that use GNU Parallel to process data for publication",
         "please cite as described in 'parallel --citation'.\n",
        );
    print join("\n", @banner);
 }
--- a/bsearch/regressiontest
+++ b/bsearch/regressiontest
@ -0,0 +1,44 @@
 #!/bin/bash
 # Scratch file name used as prefix by the test functions. It is exported
 # because the tests run in shells started by parallel.
 # mktemp replaces the deprecated, Debian-only tempfile command.
 test_tmp=$(mktemp)
 export test_tmp
 test_n() {
    # Numeric search (-n) for 0, 2, 2.1 and 100000 in files of zero to
    # three lines, where one value at a time is written with trailing
    # zeros (1.000000000 instead of 1).
    tmp=${test_tmp}_n
    # Empty file: nothing to show, just search
    true > $tmp
    bsearch -n $tmp 0 2 2.1 100000
    # Each entry is the content of one test file, one word per line.
    # The empty entry gives a file holding a single empty line.
    for contents in \
        "" \
        "1.000000000" \
        "1.000000000 2" \
        "1 2.000000000" \
        "1.000000000 2 3" \
        "1 2.000000000 3" \
        "1 2 3.000000000"
    do
        # $contents is unquoted on purpose: it must split into words
        printf '%s\n' $contents > $tmp
        # Show the file content on one line, then the search result
        xargs < $tmp
        bsearch -n $tmp 0 2 2.1 100000
    done
    rm $tmp
 }
 # Export every function whose name contains test_ so the shells started
 # by parallel can call them.
 export -f $(compgen -A function | grep test_)
 # Run the test functions sorted by name, up to 6 at a time. stderr is
 # merged into stdout, -k keeps the output in input order and --tag
 # prefixes every output line with the name of the test.
 compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1'
--- a/transpose/transpose
+++ b/transpose/transpose
@ -7,37 +7,43 @@ use File::Temp;
 my $delimiter = shift;
 my $buffer = shift;
 $delimiter ||= ",";
 # Use at most 1000M before flushing
-$buffer ||= 1000_000_000;
+$buffer ||= "1000M";
 $buffer = multiply_binary_prefix($buffer);
 # Perl makes the buffer balloon to 10 times the requested value
 $buffer /= 10;
 # max_col_size will be lowered after first line read.
 $max_col_size = $buffer;
 my $delimiter_regexp = $delimiter;
 $delimiter_regexp =~ s/(\W)/\\$1/g;
 my @current;
-my $last_t = 0;
+my $col_no_last_line = 0;
 my $lineno = 0;
 my %col;
 while(<>) {
    chomp;
-    # Split current line into columns
+    my $col_no = 0;
-    @current = split /$delimiter_regexp/o, $_;
+    my @to_be_flushed = ();
-    my $t = 0;
+    map {
-    map { 
+	push(@{$col{$col_no}},$_);
-	push(@{$col{$t}},$_);
+	$col_size{$col_no} += length $_;
-	$col_size{$t} += length $_;
+	if($col_size{$col_no} > $max_col_size) {
-	if($col_size{$t} > $max_col_size) {
+	    push @to_be_flushed, $col_no;
-	    flush(\%col,$t);
+	    $col_size{$col_no} = 0;
 	    $col_size{$t} = 0;
 	}
-	$t++;
+	$col_no++;
-    } @current;
+    } split /$delimiter_regexp/o, $_; # This should do de-csv'ing
-    if($t != $last_t) {
+    if(@to_be_flushed) {
-	if(0 == $last_t) {
+	flush(\%col,@to_be_flushed);
-	    $last_t = $t;
+    }
-	    $max_col_size = $buffer/$last_t;
+    if($col_no != $col_no_last_line) {
 	if(0 == $col_no_last_line) {
 	    # This is first time around
 	    $col_no_last_line = $col_no;
 	    $max_col_size = $buffer/$col_no_last_line;
 	} else {
-	    warning("Number of columns in line $NR: $t != $last_t\n");
+	    warning("Number of columns in line $NR: $col_no != $col_no_last_line\n");
 	}
    }
 }
@ -86,3 +92,34 @@ sub error {
    my @w = @_;
    print STDERR "transpose: Error: ", @w;
 }
 sub multiply_binary_prefix {
    # Evaluate a number with a multiplier suffix
    # k=10^3, m=10^6, g=10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24, x=10^27
    # K=2^10, M=2^20, G=2^30, T=2^40, P=2^50, E=2^60, Z=2^70, Y=2^80, X=2^90
    # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^60, Zi=2^70, Yi=2^80, Xi=2^90
    # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^60, zi=2^70, yi=2^80, xi=2^90
    # 13G = 13*1024*1024*1024 = 13958643712
    # Input:
    #   $s = string with suffixes
    # Returns:
    #   $s = number with the suffixes multiplied out
    #        undef if $s is undef or the result is not a valid expression
    my $s = shift;
    return $s if not defined $s;
    my @letters = qw(k m g t p e z y x);
    # Two-letter forms (Ki, ki, Mi, ...) first, so the 'i' is consumed
    # together with its letter and not left behind in the expression.
    for my $i (0 .. $#letters) {
        my $binary = "*1024" x ($i + 1);
        $s =~ s/$letters[$i]i/$binary/gi;
    }
    # Single letters: upper case is a power of 1024, lower case a power
    # of 1000. The inserted text contains no letters, so a later
    # substitution can never re-match an earlier replacement.
    for my $i (0 .. $#letters) {
        my $binary = "*1024" x ($i + 1);
        my $decimal = "*1000" x ($i + 1);
        my $upper = uc($letters[$i]);
        $s =~ s/$upper/$binary/g;
        $s =~ s/$letters[$i]/$decimal/g;
    }
    # NOTE(review): this is a string eval of a value taken from the
    # command line; anything that is valid Perl will be executed.
    $s = eval $s;
    return $s;
 }
--- a/transpose/transpose-par.pl
+++ b/transpose/transpose-par.pl
@ -1,7 +1,5 @@
 #!/usr/bin/perl
 #!/usr/local/bin/parallel --shebang-wrap --pipe --block 10m -k --files /usr/bin/perl | xargs paste
 use Text::CSV;
 use File::Temp qw(tempfile tempdir);
@ -32,6 +30,65 @@ while(my $l = <>) {
 print map { join("\t",@$_),"\n" } @table;
 sub guess_csv_setting {
-    # Based on a single line guess the csv_setting
+    # Based on two lines guess the csv_setting
-    return { binary => 1 };
+    my $line = shift;
    # Potential field separators
    # Priority:
    # \0 if both lines have the same number
    # \t if both lines have the same number
    my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/");
    my %count;
    @count{@fieldsep} = (0,0,0,0,0,0);
    # Count characters
    map { $count{$_}++ } split //,$line;
    my @sepsort = sort { $count{$b} <=> $count{$a} } @fieldsep;
    my $guessed_sep;
    if($count{"\0"} > 0) {
 	# \0 is in the line => this is definitely the field sep
 	$guessed_sep = "\0";
    } elsif($count{"\t"} > 0) {
 	# \t is in the line => this is definitely the field sep
 	$guessed_sep = "\t";
    } else {
 	$guessed_sep = $sepsort[0];
    }
    return { binary => 1, sep_char => $guessed_sep };
 }
 sub _guess_csv_setting {
    # Try different csv_settings
    # Return a $csv object with the best setting
    # NOTE(review): this looks like an unfinished alternative to
    # guess_csv_setting(); no caller is visible in this part of the file.
    # Uses:
    #   @lines = the lines to test-parse. It is not declared in this sub;
    #     presumably a file-level variable - TODO confirm.
    # Candidate settings, tried in this order
    my @csv_file_types = 
	( { binary => 1, sep_char => "\0" },
	  { binary => 1, sep_char => "\t" },
	  { binary => 1, sep_char => "," },
	  { binary => 1 },
	);
    my $succesful_csv_type;
    my $csv;
    # NOTE(review): there is no 'last' after a candidate parses all lines,
    # so after the loop $succesful_csv_type only reflects the final
    # candidate ({ binary => 1 }).
    for my $csv_file_type (@csv_file_types) {
	$csv = Text::CSV->new ( $csv_file_type )
	    or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag ();
	# Assume success until a line fails to parse
	$succesful_csv_type = $csv_file_type;
	my $last_n_fields;
	for my $line (@lines) {
	    if($csv->parse($line)) {
		# Scalar assignment: presumably the number of fields - TODO confirm
		my $n_fields = ($csv->fields());
		# NOTE(review): assigns the package variable $last_fields, not
		# the $last_n_fields declared above, and the field counts are
		# never compared - looks like a typo / unfinished check.
		$last_fields ||= $n_fields;
	    } else{
		# This candidate cannot parse the input
		$succesful_csv_type = 0;
		last;
	    }
	}
    }
    if(not $succesful_csv_type) {
	# Called in void context; presumably prints the parse error - TODO confirm
	$csv->error_diag();
    }
    # NOTE(review): when no candidate succeeded this calls new() with 0
    $csv = Text::CSV->new ( $succesful_csv_type )  # should set binary attribute.
	or die "Cannot use CSV: ".Text::CSV->error_diag ();
    return($csv);
 }
--- a/transpose/transpose-simple
+++ b/transpose/transpose-simple
@ -1,3 +1,9 @@
 Can it be done more simply?
 zcat D.gz | perl -ne 's/\s+/\n/g; open(OUT,">","out".(++$out)); print OUT' ; paste out* | pigz >Dt.gz
 Chop CSV into fields
 multi file paste
 paste out1 out2 | paste - out3
--- a/transpose/transposewrap.pl
+++ b/transpose/transposewrap.pl
@ -1,6 +1,5 @@
 #!/usr/bin/perl
 use File::Temp qw(tempfile tempdir);
 #$Global::debug = 1;
@ -8,7 +7,7 @@ my $block = "30m";
 debug("parallel --pipe --block $block -k --files -j150% transpose-par.pl\n");
 my @files = `parallel --pipe --block $block -k --files -j150% transpose-par.pl`;
 chomp(@files);
-my $tmp = File::Temp::tempdir(CLEANUP => 0);
+my $tmp = File::Temp::tempdir(CLEANUP => 1);
 my $fifo = "$tmp/0000000";
 my $cmd = "mkfifo $fifo; paste > $fifo ";
 my (@fifos, @args);