Merge branch 'master' of gitlab.com:ole.tange/tangetools

2016-08-13 20:41:59 +02:00 · 2016-08-13 20:41:59 +02:00 · ee13554589
parent 498e333232 1e87eb8f2c
commit ee13554589
8 changed files with 572 additions and 27 deletions
--- a/6
+++ b/6
@ -1,12 +1,12 @@
-CMD = blink histogram upsidedown tracefile timestamp rand rrm goodpasswd gitnext
+CMD = blink bsearch histogram upsidedown tracefile timestamp rand rrm goodpasswd gitnext

-all: blink/blink.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1
+all: blink/blink.1 bsearch/bsearch.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1

 %.1: %
 	pod2man $< > $@

 install:
 	mkdir -p /usr/local/bin
-	parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm gitnext
+	parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink bsearch reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm gitnext
 	mkdir -p /usr/local/share/man/man1
 	parallel ln -sf `pwd`/{} /usr/local/share/man/man1/{/} ::: */*.1
--- a/2
+++ b/2
@ -2,6 +2,8 @@ Tools developed by Ole Tange <ole@tange.dk>.

 Probably not useful for you, but then again you never know.

+bsearch - binary search through sorted text files.
+
 em - Force emacs to run in terminal. Use xemacs if installed.

 field - Split on space. Give the given field number. Supports syntax 1-3,6-
--- a/bsearch/bsearch
+++ b/bsearch/bsearch
@ -0,0 +1,400 @@
+#!/usr/bin/perl
+
+=head1 NAME
+
+bsearch - binary search through sorted text files
+
+=head1 SYNOPSIS
+
+B<bsearch> [-nrfB] file string [string...]
+
+=head1 DESCRIPTION
+
+B<bsearch> searches a sorted file for a string. It outputs the
+following line or the byte position of this line, which is where the
+string would have been if it had been in the sorted file.
+
+=over 9
+
+=item B<--ignore-leading-blanks> (not implemented)
+
+=item B<-b>
+
+ignore leading blanks
+
+=item B<--byte-offset>
+
+=item B<-B>
+
+print byte position where string would have been
+
+=item B<--dictionary-order> (not implemented)
+
+=item B<-d>
+
+consider only blanks and alphanumeric characters
+
+=item B<--debug> (not implemented)
+
+=item B<-D>
+
+annotate the part of the line used to sort, and warn about
+questionable usage to stderr
+
+=item B<--ignore-case>
+
+=item B<-f>
+
+fold lower case to upper case characters
+
+=item B<--general-numeric-sort> (not implemented)
+
+=item B<-g>
+
+compare according to general numerical value
+
+=item B<--ignore-nonprinting> (not implemented)
+
+=item B<-i>
+
+consider only printable characters
+
+=item B<--month-sort> (not implemented)
+
+=item B<-M>
+
+compare (unknown) < 'JAN' < ... < 'DEC'
+
+=item B<--human-numeric-sort> (not implemented)
+
+=item B<-h>
+
+compare human readable numbers (e.g., 2K 1G)
+
+=item B<--key=KEYDEF> (not implemented)
+
+=item B<-k>
+
+sort via a key; KEYDEF gives location and type
+
+=item B<--numeric-sort>
+
+=item B<-n>
+
+compare according to string numerical value
+
+=item B<--random-sort> (not implemented)
+
+=item B<-R>
+
+sort by random hash of keys
+
+=item B<--reverse>
+
+=item B<-r>
+
+reverse the result of comparisons
+
+=item B<--sort=WORD> (not implemented)
+
+sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
+B<-M>, numeric B<-n>, random B<-R>, version B<-V>
+
+=item B<-t> (not implemented)
+
+=item B<--field-separator=SEP>
+
+use SEP instead of non-blank to blank transition
+
+=item B<-z> (not implemented)
+
+=item B<--zero-terminated>
+
+end lines with 0 byte, not newline
+
+=back
+
+=head1 EXAMPLES
+
+=head2 Missing
+
+Missing
+
+
+=head1 REPORTING BUGS
+
+B<bsearch> is part of tangetools. Report bugs to <tools@tange.dk>.
+
+
+=head1 AUTHOR
+
+Copyright (C) 2016 Ole Tange http://ole.tange.dk
+
+
+=head1 LICENSE
+
+Copyright (C) 2013 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+=head2 Documentation license I
+
+Permission is granted to copy, distribute and/or modify this documentation
+under the terms of the GNU Free Documentation License, Version 1.3 or
+any later version published by the Free Software Foundation; with no
+Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
+Texts.  A copy of the license is included in the file fdl.txt.
+
+=head2 Documentation license II
+
+You are free:
+
+=over 9
+
+=item B<to Share>
+
+to copy, distribute and transmit the work
+
+=item B<to Remix>
+
+to adapt the work
+
+=back
+
+Under the following conditions:
+
+=over 9
+
+=item B<Attribution>
+
+You must attribute the work in the manner specified by the author or
+licensor (but not in any way that suggests that they endorse you or
+your use of the work).
+
+=item B<Share Alike>
+
+If you alter, transform, or build upon this work, you may distribute
+the resulting work only under the same, similar or a compatible
+license.
+
+=back
+
+With the understanding that:
+
+=over 9
+
+=item B<Waiver>
+
+Any of the above conditions can be waived if you get permission from
+the copyright holder.
+
+=item B<Public Domain>
+
+Where the work or any of its elements is in the public domain under
+applicable law, that status is in no way affected by the license.
+
+=item B<Other Rights>
+
+In no way are any of the following rights affected by the license:
+
+=over 9
+
+=item *
+
+Your fair dealing or fair use rights, or other applicable
+copyright exceptions and limitations;
+
+=item *
+
+The author's moral rights;
+
+=item *
+
+Rights other persons may have either in the work itself or in
+how the work is used, such as publicity or privacy rights.
+
+=back
+
+=item B<Notice>
+
+For any reuse or distribution, you must make clear to others the
+license terms of this work.
+
+=back
+
+A copy of the full license is included in the file as cc-by-sa.txt.
+
+=head1 DEPENDENCIES
+
+B<bsearch> uses Perl.
+
+
+=head1 SEE ALSO
+
+B<grep>(1), B<sort>(1).
+
+=cut
+
+use Getopt::Long;
+
+Getopt::Long::Configure("bundling","require_order");
+
+# Set the program name before parsing options so that every diagnostic
+# printed through error()/warning() is prefixed with the right name.
+$Global::progname = "bsearch";
+$Global::version = 20160712;
+
+# All options are accepted, but only --version, -B, -f, -n, -r and -z
+# are acted upon in this file; the rest are parsed and ignored.
+# GetOptions() prints its own message for unknown or malformed options;
+# stop instead of searching with a half-parsed command line.
+GetOptions(
+    "debug|D=s" => \$opt::D,
+    "version" => \$opt::version,
+    "verbose|v" => \$opt::verbose,
+    "B|byte-offset" => \$opt::byte_offset,
+    "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
+    "d|dictionary-order" => \$opt::dictionary_order,
+    "f|ignore-case" => \$opt::ignore_case,
+    "g|general-numeric-sort" => \$opt::general_numeric_sort,
+    "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
+    "M|month-sort" => \$opt::month_sort,
+    "h|human-numeric-sort" => \$opt::human_numeric_sort,
+    "n|numeric-sort" => \$opt::numeric_sort,
+    "r|reverse" => \$opt::reverse,
+    "sort=s" => \$opt::sort,
+    "V|version-sort" => \$opt::version_sort,
+    "k|key=s" => \@opt::key,
+    "t|field-separator=s" => \$opt::field_separator,
+    "z|zero-terminated" => \$opt::zero_terminated,
+    ) or exit 1;
+if($opt::version) {
+    version();
+    exit 0;
+}
+# -z: records end with a 0 byte instead of a newline
+if($opt::zero_terminated) { $/ = "\0"; }
+
+# First non-option argument is the sorted file, the rest are search keys
+my $file = shift;
+if(not defined $file) {
+    print STDERR "Usage: $Global::progname [-nrfB] file string [string...]\n";
+    exit 1;
+}
+
+for my $key (@ARGV) {
+    print bsearch($file,$key);
+}
+
+sub bsearch {
+    # Binary search a sorted file for a key.
+    # Uses:
+    #   $opt::byte_offset = return a byte offset instead of a line
+    #   compare() for the ordering, error() for diagnostics
+    # Input:
+    #   $file = name of the sorted file
+    #   $key = string to look for
+    # Returns:
+    #   The last line that sorts before $key ("" if $key belongs before
+    #   the very first line), or with $opt::byte_offset the byte offset
+    #   of the line following it ("0\n" for the very first line).
+    my $file = shift;
+    my $key = shift;
+    my $min = 0;
+    my $max = -s $file;
+
+    my $fh;
+    if(not open($fh, "<", $file)) {
+        error("Cannot open '$file'");
+        exit 1;
+    }
+    my $line;
+    # Invariant: the line following the one that byte $min falls into
+    # sorts before $key; the line following the one that byte $max falls
+    # into does not (or does not exist).
+    while($max - $min > 1) {
+        my $middle = int(($max + $min)/2);
+        seek($fh,$middle,0)
+            or die "$Global::progname: Cannot seek in '$file': $!\n";
+        # $middle is most likely inside a line: skip the rest of it
+        my $half = <$fh>;
+        if(eof($fh)
+           or
+           compare(($line = <$fh>),$key) >= 0) {
+            $max = $middle;
+        } else {
+            $min = $middle;
+        }
+    }
+    # The loop never looks at the first line itself. If $min never moved,
+    # the answer depends on the complete first line, so read it from
+    # byte 0; reading from $max (byte 1) would compare a line with its
+    # first character chopped off. In all other cases $max is the start
+    # of the last line that sorts before $key.
+    my $start = ($min == 0) ? 0 : $max;
+    seek($fh,$start,0)
+        or die "$Global::progname: Cannot seek in '$file': $!\n";
+    $line = <$fh>;
+    # Offset just after $line = where $key would have been
+    my $next_line_pos = tell($fh);
+    close $fh;
+    if(compare($line,$key) >= 0) {
+        # $key belongs before the very first line
+        if($opt::byte_offset) {
+            return "0\n";
+        } else {
+            return "";
+        }
+    } else {
+        if($opt::byte_offset) {
+            return $next_line_pos."\n";
+        } else {
+            return $line;
+        }
+    }
+}
+
+sub compare {
+    # Compare two values with the ordering selected on the command line.
+    # Uses:
+    #   $opt::reverse = swap the operands
+    #   $opt::ignore_case = upper case both operands before comparing
+    #   $opt::numeric_sort = compare numerically
+    #   $opt::numascii = compare numerically, then as strings on a tie
+    #                    (no option parsed in this file sets it)
+    # Input:
+    #   $left, $right = values to compare (a line and a search key)
+    # Returns:
+    #   -1, 0 or 1 as from <=> and cmp
+    # The locals are deliberately not called $a/$b: those are the
+    # package variables sort() uses.
+    my ($left,$right) = @_;
+    if($opt::reverse) {
+        ($left,$right) = ($right,$left);
+    }
+    if($opt::ignore_case) {
+        $left = uc($left);
+        $right = uc($right);
+    }
+    if($opt::numeric_sort) {
+        return $left <=> $right;
+    } elsif($opt::numascii) {
+        # This has to be '||': with the low precedence 'or' the return
+        # happened before the string tie-breaker was evaluated.
+        return(($left <=> $right) || ($left cmp $right));
+    } else {
+        return $left cmp $right;
+    }
+}
+
+sub status {
+    # Write every message followed by a newline to the status filehandle.
+    # Uses:
+    #   $Global::status_fd = filehandle to write to (default: STDERR)
+    # Input:
+    #   @_ = messages
+    my $out = $Global::status_fd || *STDERR;
+    my @with_newlines;
+    for my $msg (@_) {
+        push @with_newlines, $msg, "\n";
+    }
+    print {$out} @with_newlines;
+    $out->flush();
+}
+
+sub status_no_nl {
+    # Write the messages as they are (no newline added) to the status
+    # filehandle.
+    # Uses:
+    #   $Global::status_fd = filehandle to write to (default: STDERR)
+    # Input:
+    #   @_ = messages
+    my $out = $Global::status_fd || *STDERR;
+    print {$out} @_;
+    $out->flush();
+}
+
+sub warning {
+    # Print each message as "<progname>: Warning: <message>" on its own
+    # line via status_no_nl().
+    # Uses:
+    #   $Global::progname = name used as prefix ("parallel" if unset)
+    # Input:
+    #   @_ = messages
+    my $name = $Global::progname || "parallel";
+    my @pieces;
+    for my $msg (@_) {
+        push @pieces, $name, ": Warning: ", $msg, "\n";
+    }
+    status_no_nl(@pieces);
+}
+
+sub error {
+    # Print each message as "<progname>: Error: <message>" on its own
+    # line via status().
+    # Uses:
+    #   $Global::progname = name used as prefix ("parallel" if unset)
+    # Input:
+    #   @_ = messages
+    my $name = $Global::progname || "parallel";
+    my @prefixed;
+    for my $msg (@_) {
+        push @prefixed, $name.": Error: ".$msg;
+    }
+    status(@prefixed);
+}
+
+sub die_bug {
+    # Report an internal error ("this should not happen") and exit.
+    # Uses:
+    #   $Global::progname, $Global::version
+    # Input:
+    #   $bugid = identifier of the place in the code that failed
+    # Returns: N/A (exits with status 255)
+    my $bugid = shift;
+    # Bugs go to the address given under REPORTING BUGS in the POD
+    print STDERR
+        ("$Global::progname: This should not happen. You have found a bug.\n",
+         "Please contact <tools\@tange.dk> and include:\n",
+         "* The version number: $Global::version\n",
+         "* The bugid: $bugid\n",
+         "* The command line being run\n",
+         "* The files being read (put the files on a webserver if they are big)\n",
+         "\n",
+         "If you get the error on smaller/fewer files, please include those instead.\n");
+    # ::wait_and_exit() is not defined in this script; calling it would
+    # die with "Undefined subroutine" instead of exiting with 255.
+    exit(255);
+}
+
+sub version {
+    # Print the version and license banner on STDOUT.
+    # Uses:
+    #   $Global::progname, $Global::version
+    # Returns: N/A
+    my @banner =
+        ("GNU $Global::progname $Global::version",
+         "Copyright (C) 2016",
+         "Ole Tange and Free Software Foundation, Inc.",
+         "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
+         "This is free software: you are free to change and redistribute it.",
+         "GNU $Global::progname comes with no warranty.",
+         "",
+         "Web site: http://www.gnu.org/software/${Global::progname}\n",
+         "When using programs that use GNU Parallel to process data for publication",
+         "please cite as described in 'parallel --citation'.\n",
+        );
+    # Entries that already end in "\n" give an empty line in the output
+    my $text = join("\n", @banner);
+    print $text;
+}
--- a/bsearch/regressiontest
+++ b/bsearch/regressiontest
@ -0,0 +1,44 @@
+#!/bin/bash
+
+test_tmp=`tempfile`
+export test_tmp
+
+test_n() {
+    # Numeric search (-n) for the same four keys in files of 0 to 3
+    # lines, where one of the lines is written with trailing decimals.
+    # Before each search (except on the empty file) the file content is
+    # shown on a single line with xargs.
+    tmp=${test_tmp}_n
+
+    # Empty file
+    true > $tmp
+    bsearch -n $tmp 0 2 2.1 100000
+
+    # A single empty line
+    echo > $tmp
+    xargs < $tmp
+    bsearch -n $tmp 0 2 2.1 100000
+
+    # One line
+    printf '%s\n' 1.000000000 > $tmp
+    xargs < $tmp
+    bsearch -n $tmp 0 2 2.1 100000
+
+    # Two lines
+    printf '%s\n' 1.000000000 2 > $tmp
+    xargs < $tmp
+    bsearch -n $tmp 0 2 2.1 100000
+
+    printf '%s\n' 1 2.000000000 > $tmp
+    xargs < $tmp
+    bsearch -n $tmp 0 2 2.1 100000
+
+    # Three lines
+    printf '%s\n' 1.000000000 2 3 > $tmp
+    xargs < $tmp
+    bsearch -n $tmp 0 2 2.1 100000
+
+    printf '%s\n' 1 2.000000000 3 > $tmp
+    xargs < $tmp
+    bsearch -n $tmp 0 2 2.1 100000
+
+    printf '%s\n' 1 2 3.000000000 > $tmp
+    xargs < $tmp
+    bsearch -n $tmp 0 2 2.1 100000
+
+    rm $tmp
+}
+
+
+export -f $(compgen -A function | grep test_)
+compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1'
--- a/transpose/transpose
+++ b/transpose/transpose
@ -7,37 +7,43 @@ use File::Temp;
 my $delimiter = shift;
 my $buffer = shift;

+$delimiter ||= ",";
 # Use at most 1000M before flushing
-$buffer ||= 1000_000_000;
+$buffer ||= "1000M";
+$buffer = multiply_binary_prefix($buffer);
 # Perl makes the buffer balloon to 10 times the requested value
 $buffer /= 10;
+# max_col_size will be lowered after first line read.
 $max_col_size = $buffer;
 my $delimiter_regexp = $delimiter;
 $delimiter_regexp =~ s/(\W)/\\$1/g;
 my @current;
-my $last_t = 0;
+my $col_no_last_line = 0;
 my $lineno = 0;
 my %col;
 while(<>) {
    chomp;
-    # Split current line into columns
-    @current = split /$delimiter_regexp/o, $_;
-    my $t = 0;
-    map { 
-	push(@{$col{$t}},$_);
-	$col_size{$t} += length $_;
-	if($col_size{$t} > $max_col_size) {
-	    flush(\%col,$t);
-	    $col_size{$t} = 0;
+    # Append every field of the current line to its column buffer.
+    # Columns whose buffer grew past $max_col_size are collected and
+    # flushed in one call after the whole line has been distributed.
+    my $col_no = 0;
+    my @to_be_flushed = ();
+    map {
+	push(@{$col{$col_no}},$_);
+	$col_size{$col_no} += length $_;
+	if($col_size{$col_no} > $max_col_size) {
+	    push @to_be_flushed, $col_no;
+	    $col_size{$col_no} = 0;
 	}
-	$t++;
-    } @current;
-    if($t != $last_t) {
-	if(0 == $last_t) {
-	    $last_t = $t;
-	    $max_col_size = $buffer/$last_t;
+	$col_no++;
+    } split /$delimiter_regexp/o, $_; # This should do de-csv'ing
+    if(@to_be_flushed) {
+	flush(\%col,@to_be_flushed);
+    }
+    if($col_no != $col_no_last_line) {
+	if(0 == $col_no_last_line) {
+	    # This is first time around
+	    $col_no_last_line = $col_no;
+	    $max_col_size = $buffer/$col_no_last_line;
 	} else {
-	    warning("Number of columns in line $NR: $t != $last_t\n");
+	    # $. is the current input line number; $NR is awk and was
+	    # never set here, so the message had no line number.
+	    warning("Number of columns in line ".$.": $col_no != $col_no_last_line\n");
 	}
    }
 }
@ -86,3 +92,34 @@ sub error {
    my @w = @_;
    print STDERR "transpose: Error: ", @w;
 }
+
+sub multiply_binary_prefix {
+    # Evaluate a number with a binary or decimal prefix
+    # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^60, Zi=2^70, Yi=2^80, Xi=2^90
+    # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^60, zi=2^70, yi=2^80, xi=2^90
+    # K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^60, Z =2^70, Y =2^80, X =2^90
+    # k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24, x=10^27
+    # 13G = 13*1024*1024*1024 = 13958643712
+    # Input:
+    #   $s = string with prefixes
+    # Returns:
+    #   $s with the prefixes multiplied out (undef if it cannot be
+    #   evaluated)
+    my $s = shift;
+    my @prefixes = qw(k m g t p e z y x);
+    for my $i (0 .. $#prefixes) {
+        my $lower = $prefixes[$i];
+        my $upper = uc $lower;
+        my $binary = "*1024" x ($i+1);
+        my $decimal = "*1000" x ($i+1);
+        # The two-letter form must go first: otherwise the one-letter
+        # rules eat the letter and leave a stray 'i' that breaks eval.
+        $s =~ s/${lower}i/$binary/gi;
+        # Upper case alone is binary, lower case alone is decimal
+        $s =~ s/$upper/$binary/g;
+        $s =~ s/$lower/$decimal/g;
+    }
+    # NOTE(review): $s comes from the command line and is evaluated as
+    # Perl code. That is fine for a tool run by the user on their own
+    # input, but it must not be fed untrusted strings.
+    $s = eval $s;
+    return $s;
+}
--- a/transpose/transpose-par.pl
+++ b/transpose/transpose-par.pl
@ -1,7 +1,5 @@
 #!/usr/bin/perl

-#!/usr/local/bin/parallel --shebang-wrap --pipe --block 10m -k --files /usr/bin/perl | xargs paste
-
 use Text::CSV;
 use File::Temp qw(tempfile tempdir);

@ -32,6 +30,65 @@ while(my $l = <>) {
 print map { join("\t",@$_),"\n" } @table;

 sub guess_csv_setting {
-    # Based on a single line guess the csv_setting
-    return { binary => 1 };
+    # Guess the Text::CSV settings from a single line of input.
+    # Priority of the field separator:
+    #   \0 if the line contains any
+    #   \t if the line contains any
+    #   otherwise the candidate that occurs most often (on a tie the
+    #   one listed first in @fieldsep)
+    # Input:
+    #   $line = one line of the input
+    # Returns:
+    #   hashref with settings for Text::CSV->new()
+    my $line = shift;
+    # Potential field separators
+    my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/");
+    # Start every candidate at 0 (the old literal list of zeroes was one
+    # short, which left the count for "/" undefined)
+    my %count = map { $_ => 0 } @fieldsep;
+    # Count characters
+    for my $char (split //, $line) {
+        $count{$char}++;
+    }
+    my $guessed_sep;
+    if($count{"\0"} > 0) {
+        # \0 is in the line => this is definitely the field sep
+        $guessed_sep = "\0";
+    } elsif($count{"\t"} > 0) {
+        # \t is in the line => this is definitely the field sep
+        $guessed_sep = "\t";
+    } else {
+        # Strictly greater keeps the earliest candidate on a tie without
+        # depending on sort being stable
+        $guessed_sep = $fieldsep[0];
+        for my $sep (@fieldsep) {
+            if($count{$sep} > $count{$guessed_sep}) {
+                $guessed_sep = $sep;
+            }
+        }
+    }
+    return { binary => 1, sep_char => $guessed_sep };
+}
+
+sub _guess_csv_setting {
+    # Try different csv_settings
+    # Return a $csv object with the best setting
+    #
+    # NOTE(review): nothing visible here calls this sub, and it looks
+    # unfinished:
+    # * @lines is not declared in this sub - presumably a file level
+    #   variable holding sample lines; confirm before use.
+    # * $last_n_fields is declared but $last_fields is the variable
+    #   assigned - probably a typo; neither value is used afterwards.
+    # * The outer loop has no 'last', so $succesful_csv_type ends up
+    #   describing only the final candidate ({ binary => 1 }).
+    my @csv_file_types = 
+	( { binary => 1, sep_char => "\0" },
+	  { binary => 1, sep_char => "\t" },
+	  { binary => 1, sep_char => "," },
+	  { binary => 1 },
+	);
+
+    my $succesful_csv_type;
+    my $csv;
+    for my $csv_file_type (@csv_file_types) {
+	$csv = Text::CSV->new ( $csv_file_type )
+	    or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag ();
+	# Assume this candidate works until a line fails to parse
+	$succesful_csv_type = $csv_file_type;
+	my $last_n_fields;
+	for my $line (@lines) {
+	    if($csv->parse($line)) {
+		# NOTE(review): fields() is called in scalar context here;
+		# presumably the number of fields was intended - confirm.
+		my $n_fields = ($csv->fields());
+		$last_fields ||= $n_fields;
+		
+	    } else{
+		# A line could not be parsed: reject this candidate
+		$succesful_csv_type = 0;
+		last;
+	    }
+	}
+	
+    }
+    if(not $succesful_csv_type) {
+	# NOTE(review): the return value of error_diag() is discarded
+	$csv->error_diag();
+    }
+    
+    # NOTE(review): if no candidate succeeded, $succesful_csv_type is 0
+    # at this point and is passed to new() as is.
+    $csv = Text::CSV->new ( $succesful_csv_type )  # should set binary attribute.
+	or die "Cannot use CSV: ".Text::CSV->error_diag ();
+    return($csv);
 }
--- a/transpose/transpose-simple
+++ b/transpose/transpose-simple
@ -1,3 +1,9 @@
 Can it be done more simple?

 zcat D.gz | perl -ne 's/\s+/\n/g; open(OUT,">","out".(++$out)); print OUT' ; paste out* | pigz >Dt.gz
+
+Chop CSV into fields
+
+multi file paste
+
+paste out1 out2 | paste - out3
--- a/transpose/transposewrap.pl
+++ b/transpose/transposewrap.pl
@ -1,6 +1,5 @@
 #!/usr/bin/perl

-
 use File::Temp qw(tempfile tempdir);

 #$Global::debug = 1;
@ -8,7 +7,7 @@ my $block = "30m";
 debug("parallel --pipe --block $block -k --files -j150% transpose-par.pl\n");
 my @files = `parallel --pipe --block $block -k --files -j150% transpose-par.pl`;
 chomp(@files);
-my $tmp = File::Temp::tempdir(CLEANUP => 0);
+my $tmp = File::Temp::tempdir(CLEANUP => 1);
 my $fifo = "$tmp/0000000";
 my $cmd = "mkfifo $fifo; paste > $fifo ";
 my (@fifos, @args);