diff --git a/Makefile b/Makefile index b39a05a..036bb69 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,12 @@ -CMD = blink histogram upsidedown tracefile timestamp rand rrm goodpasswd gitnext +CMD = blink bsearch histogram upsidedown tracefile timestamp rand rrm goodpasswd gitnext -all: blink/blink.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1 +all: blink/blink.1 bsearch/bsearch.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1 %.1: % pod2man $< > $@ install: mkdir -p /usr/local/bin - parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm gitnext + parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink bsearch reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm gitnext mkdir -p /usr/local/share/man/man1 parallel ln -sf `pwd`/{} /usr/local/share/man/man1/{/} ::: */*.1 diff --git a/README b/README index 56807aa..c97c97b 100644 --- a/README +++ b/README @@ -2,6 +2,8 @@ Tools developed by Ole Tange . Probably not useful for you, but then again you never now. +bsearch - binary search through sorted text files. + em - Force emacs to run in terminal. Use xemacs if installed. field - Split on space. Give the given field number. Supports syntax 1-3,6- diff --git a/bsearch/bsearch b/bsearch/bsearch new file mode 100755 index 0000000..1ce4694 --- /dev/null +++ b/bsearch/bsearch @@ -0,0 +1,400 @@ +#!/usr/bin/perl + +=head1 NAME + +bsearch - binary search through sorted text files + +=head1 SYNOPSIS + +B<bsearch> [-nrfB] file string [string...] + +=head1 DESCRIPTION + +B<bsearch> searches a sorted file for a string. 
It outputs the +following line or the byte position of this line, which is where the +string would have been if it had been in the sorted file. + +=over 9 + +=item B<--ignore-leading-blanks> (not implemented) + +=item B<-b> + +ignore leading blanks + +=item B<--byte-offset> + +=item B<-B> + +print byte position where string would have been + +=item B<--dictionary-order> (not implemented) + +=item B<-d> + +consider only blanks and alphanumeric characters + +=item B<--debug> (not implemented) + +=item B<-D> + +annotate the part of the line used to sort, and warn about +questionable usage to stderr + +=item B<--ignore-case> + +=item B<-f> + +fold lower case to upper case characters + +=item B<--general-numeric-sort> (not implemented) + +=item B<-g> + +compare according to general numerical value + +=item B<--ignore-nonprinting> (not implemented) + +=item B<-i> + +consider only printable characters + +=item B<--month-sort> (not implemented) + +=item B<-M> + +compare (unknown) < 'JAN' < ... < 'DEC' + +=item B<--human-numeric-sort> (not implemented) + +=item B<-h> + +compare human readable numbers (e.g., 2K 1G) + +=item B<--key=KEYDEF> (not implemented) + +=item B<-k> + +sort via a key; KEYDEF gives location and type + +=item B<--numeric-sort> + +=item B<-n> + +compare according to string numerical value + +=item B<--random-sort> (not implemented) + +=item B<-R> + +sort by random hash of keys + +=item B<--reverse> + +=item B<-r> + +reverse the result of comparisons + +=item B<--sort=WORD> (not implemented) + +sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month +B<-M>, numeric B<-n>, random B<-R>, version B<-V> + +=item B<-t> (not implemented) + +=item B<--field-separator=SEP> + +use SEP instead of non-blank to blank transition + +=item B<-z> (not implemented) + +=item B<--zero-terminated> + +end lines with 0 byte, not newline + +=back + +=head1 EXAMPLES + +=head2 Missing + +Missing + + +=head1 REPORTING BUGS + +B<bsearch> is part of tangetools. 
Report bugs to . + + +=head1 AUTHOR + +Copyright (C) 2016 Ole Tange http://ole.tange.dk + + +=head1 LICENSE + +Copyright (C) 2013 Free Software Foundation, Inc. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +at your option any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. + +=head2 Documentation license I + +Permission is granted to copy, distribute and/or modify this documentation +under the terms of the GNU Free Documentation License, Version 1.3 or +any later version published by the Free Software Foundation; with no +Invariant Sections, with no Front-Cover Texts, and with no Back-Cover +Texts. A copy of the license is included in the file fdl.txt. + +=head2 Documentation license II + +You are free: + +=over 9 + +=item B<to Share> + +to copy, distribute and transmit the work + +=item B<to Remix> + +to adapt the work + +=back + +Under the following conditions: + +=over 9 + +=item B<Attribution> + +You must attribute the work in the manner specified by the author or +licensor (but not in any way that suggests that they endorse you or +your use of the work). + +=item B<Share Alike> + +If you alter, transform, or build upon this work, you may distribute +the resulting work only under the same, similar or a compatible +license. + +=back + +With the understanding that: + +=over 9 + +=item B<Waiver> + +Any of the above conditions can be waived if you get permission from +the copyright holder. + +=item B<Public Domain> + +Where the work or any of its elements is in the public domain under +applicable law, that status is in no way affected by the license. 
+ +=item B + +In no way are any of the following rights affected by the license: + +=over 9 + +=item * + +Your fair dealing or fair use rights, or other applicable +copyright exceptions and limitations; + +=item * + +The author's moral rights; + +=item * + +Rights other persons may have either in the work itself or in +how the work is used, such as publicity or privacy rights. + +=back + +=item B + +For any reuse or distribution, you must make clear to others the +license terms of this work. + +=back + +A copy of the full license is included in the file as cc-by-sa.txt. + +=head1 DEPENDENCIES + +B uses Perl. + + +=head1 SEE ALSO + +B(1), B(1). + +=cut + +use Getopt::Long; + +Getopt::Long::Configure("bundling","require_order"); + +GetOptions( + "debug|D=s" => \$opt::D, + "version" => \$opt::version, + "verbose|v" => \$opt::verbose, + "B|byte-offset" => \$opt::byte_offset, + "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks, + "d|dictionary-order" => \$opt::dictionary_order, + "f|ignore-case" => \$opt::ignore_case, + "g|general-numeric-sort" => \$opt::general_numeric_sort, + "i|ignore-nonprinting" => \$opt::ignore_nonprinting, + "M|month-sort" => \$opt::month_sort, + "h|human-numeric-sort" => \$opt::human_numeric_sort, + "n|numeric-sort" => \$opt::numeric_sort, + "r|reverse" => \$opt::reverse, + "sort=s" => \$opt::sort, + "V|version-sort" => \$opt::version_sort, + "k|key=s" => \@opt::key, + "t|field-separator=s" => \$opt::field_separator, + "z|zero-terminated" => \$opt::zero_terminated, + ); +$Global::progname = "bsearch"; +$Global::version = 20160712; +if($opt::version) { + version(); + exit 0; +} +if($opt::zero_terminated) { $/ = "\0"; } + +my $file = shift; + +for my $key (@ARGV) { + print bsearch($file,$key); +} + +sub bsearch { + my $file = shift; + my $key = shift; + my $min = 0; + my $max = -s $file; + + if(not open ($fh, "<", $file)) { + error("Cannot open '$file'"); + exit 1; + } + my $line; + while($max - $min > 1) { + $middle = int(($max + 
$min)/2); + seek($fh,$middle,0) or die; + my $half = <$fh>; + if(eof($fh) + or + compare(($line = <$fh>),$key) >= 0) { + $max = $middle; + } else { + $min = $middle; + } + } + seek($fh,$max,0) or die; + $line = <$fh>; + if(compare($line,$key) >= 0) { + if($opt::byte_offset) { + return "0\n"; + } else { + # The very first line + return ""; + } + } else { + if($opt::byte_offset) { + return tell($fh)."\n"; + } else { + return $line; + } + } +} + +sub compare { + my ($a,$b) = @_; + if($opt::reverse) { + ($a,$b) = ($b,$a); + } + if($opt::ignore_case) { + $a = uc($a); + $b = uc($b); + } + if($opt::numeric_sort) { + return $a <=> $b; + } elsif($opt::numascii) { + return $a <=> $b or $a cmp $b; + } else { + return $a cmp $b; + } +} + +sub status { + my @w = @_; + my $fh = $Global::status_fd || *STDERR; + print $fh map { ($_, "\n") } @w; + flush $fh; +} + +sub status_no_nl { + my @w = @_; + my $fh = $Global::status_fd || *STDERR; + print $fh @w; + flush $fh; +} + +sub warning { + my @w = @_; + my $prog = $Global::progname || "parallel"; + status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w); +} + +sub error { + my @w = @_; + my $prog = $Global::progname || "parallel"; + status(map { ($prog.": Error: ". $_); } @w); +} + +sub die_bug { + my $bugid = shift; + print STDERR + ("$Global::progname: This should not happen. 
You have found a bug.\n", + "Please contact and include:\n", + "* The version number: $Global::version\n", + "* The bugid: $bugid\n", + "* The command line being run\n", + "* The files being read (put the files on a webserver if they are big)\n", + "\n", + "If you get the error on smaller/fewer files, please include those instead.\n"); + ::wait_and_exit(255); +} + +sub version { + # Returns: N/A + print join("\n", + "GNU $Global::progname $Global::version", + "Copyright (C) 2016", + "Ole Tange and Free Software Foundation, Inc.", + "License GPLv3+: GNU GPL version 3 or later ", + "This is free software: you are free to change and redistribute it.", + "GNU $Global::progname comes with no warranty.", + "", + "Web site: http://www.gnu.org/software/${Global::progname}\n", + "When using programs that use GNU Parallel to process data for publication", + "please cite as described in 'parallel --citation'.\n", + ); +} diff --git a/bsearch/regressiontest b/bsearch/regressiontest new file mode 100755 index 0000000..6d77046 --- /dev/null +++ b/bsearch/regressiontest @@ -0,0 +1,44 @@ +#!/bin/bash + +test_tmp=`tempfile` +export test_tmp + +test_n() { + tmp=${test_tmp}_n + true > $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo > $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1.000000000 > $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1.000000000 > $tmp + echo 2 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1 > $tmp + echo 2.000000000 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1.000000000 > $tmp + echo 2 >> $tmp + echo 3 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1 > $tmp + echo 2.000000000 >> $tmp + echo 3 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + echo 1 > $tmp + echo 2 >> $tmp + echo 3.000000000 >> $tmp + xargs < $tmp + bsearch -n $tmp 0 2 2.1 100000 + rm $tmp +} + + +export -f $(compgen -A function | grep test_) +compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 
2>&1' diff --git a/transpose/transpose b/transpose/transpose index 6ddf6e5..35da6e8 100755 --- a/transpose/transpose +++ b/transpose/transpose @@ -7,37 +7,43 @@ use File::Temp; my $delimiter = shift; my $buffer = shift; +$delimiter ||= ","; # Use at most 1000M before flushing -$buffer ||= 1000_000_000; +$buffer ||= "1000M"; +$buffer = multiply_binary_prefix($buffer); # Perl makes the buffer baloon to 10 times the requested value $buffer /= 10; +# max_col_size will be lowered after first line read. $max_col_size = $buffer; my $delimiter_regexp = $delimiter; $delimiter_regexp =~ s/(\W)/\\$1/g; my @current; -my $last_t = 0; +my $col_no_last_line = 0; my $lineno = 0; my %col; while(<>) { chomp; - # Split current line into columns - @current = split /$delimiter_regexp/o, $_; - my $t = 0; - map { - push(@{$col{$t}},$_); - $col_size{$t} += length $_; - if($col_size{$t} > $max_col_size) { - flush(\%col,$t); - $col_size{$t} = 0; + my $col_no = 0; + my @to_be_flushed = (); + map { + push(@{$col{$col_no}},$_); + $col_size{$col_no} += length $_; + if($col_size{$col_no} > $max_col_size) { + push @to_be_flushed, $col_no; + $col_size{$col_no} = 0; } - $t++; - } @current; - if($t != $last_t) { - if(0 == $last_t) { - $last_t = $t; - $max_col_size = $buffer/$last_t; + $col_no++; + } split /$delimiter_regexp/o, $_; # This should do de-csv'ing + if(@to_be_flushed) { + flush(\%col,@to_be_flushed); + } + if($col_no != $col_no_last_line) { + if(0 == $col_no_last_line) { + # This is first time around + $col_no_last_line = $col_no; + $max_col_size = $buffer/$col_no_last_line; } else { - warning("Number of columns in line $NR: $t != $last_t\n"); + warning("Number of columns in line $NR: $col_no != $col_no_last_line\n"); } } } @@ -86,3 +92,34 @@ sub error { my @w = @_; print STDERR "transpose: Error: ", @w; } + +sub multiply_binary_prefix { + # Evalualte numbers with binary prefix + # k=10^3, m=10^6, g=10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24 + # K=2^10, M=2^20, G=2^30, T=2^40, 
P=2^50, E=2^70, Z=2^80, Y=2^80 + # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^70, Zi=2^80, Yi=2^80 + # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^70, zi=2^80, yi=2^80 + # 13G = 13*1024*1024*1024 = 13958643712 + my $s = shift; + $s =~ s/k/*1000/g; + $s =~ s/M/*1000*1000/g; + $s =~ s/G/*1000*1000*1000/g; + $s =~ s/T/*1000*1000*1000*1000/g; + $s =~ s/P/*1000*1000*1000*1000*1000/g; + $s =~ s/E/*1000*1000*1000*1000*1000*1000/g; + $s =~ s/Z/*1000*1000*1000*1000*1000*1000*1000/g; + $s =~ s/Y/*1000*1000*1000*1000*1000*1000*1000*1000/g; + $s =~ s/X/*1000*1000*1000*1000*1000*1000*1000*1000*1000/g; + + $s =~ s/Ki?/*1024/gi; + $s =~ s/Mi?/*1024*1024/gi; + $s =~ s/Gi?/*1024*1024*1024/gi; + $s =~ s/Ti?/*1024*1024*1024*1024/gi; + $s =~ s/Pi?/*1024*1024*1024*1024*1024/gi; + $s =~ s/Ei?/*1024*1024*1024*1024*1024*1024/gi; + $s =~ s/Zi?/*1024*1024*1024*1024*1024*1024*1024/gi; + $s =~ s/Yi?/*1024*1024*1024*1024*1024*1024*1024*1024/gi; + $s =~ s/Xi?/*1024*1024*1024*1024*1024*1024*1024*1024*1024/gi; + $s = eval $s; + return $s; +} diff --git a/transpose/transpose-par.pl b/transpose/transpose-par.pl index 8102969..515704d 100755 --- a/transpose/transpose-par.pl +++ b/transpose/transpose-par.pl @@ -1,7 +1,5 @@ #!/usr/bin/perl -#!/usr/local/bin/parallel --shebang-wrap --pipe --block 10m -k --files /usr/bin/perl | xargs paste - use Text::CSV; use File::Temp qw(tempfile tempdir); @@ -32,6 +30,65 @@ while(my $l = <>) { print map { join("\t",@$_),"\n" } @table; sub guess_csv_setting { - # Based on a single line guess the csv_setting - return { binary => 1 }; + # Based on two lines guess the csv_setting + my $line = shift; + # Potential field separators + # Priority: + # \0 if both lines have the same number + # \t if both lines have the same number + my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/"); + my %count; + @count{@fieldsep} = (0,0,0,0,0,0); + # Count characters + map { $count{$_}++ } split //,$line; + my @sepsort = sort { $count{$b} <=> $count{$a} } @fieldsep; + my 
$guessed_sep; + if($count{"\0"} > 0) { + # \0 is in the line => this is definitely the field sep + $guessed_sep = "\0"; + } elsif($count{"\t"} > 0) { + # \t is in the line => this is definitely the field sep + $guessed_sep = "\t"; + } else { + $guessed_sep = $sepsort[0]; + } + return { binary => 1, sep_char => $guessed_sep }; +} + +sub _guess_csv_setting { + # Try different csv_settings + # Return a $csv object with the best setting + my @csv_file_types = + ( { binary => 1, sep_char => "\0" }, + { binary => 1, sep_char => "\t" }, + { binary => 1, sep_char => "," }, + { binary => 1 }, + ); + + my $succesful_csv_type; + my $csv; + for my $csv_file_type (@csv_file_types) { + $csv = Text::CSV->new ( $csv_file_type ) + or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag (); + $succesful_csv_type = $csv_file_type; + my $last_n_fields; + for my $line (@lines) { + if($csv->parse($line)) { + my $n_fields = ($csv->fields()); + $last_fields ||= $n_fields; + + } else{ + $succesful_csv_type = 0; + last; + } + } + + } + if(not $succesful_csv_type) { + $csv->error_diag(); + } + + $csv = Text::CSV->new ( $succesful_csv_type ) # should set binary attribute. + or die "Cannot use CSV: ".Text::CSV->error_diag (); + return($csv); } diff --git a/transpose/transpose-simple b/transpose/transpose-simple index 12c639c..baafa2b 100644 --- a/transpose/transpose-simple +++ b/transpose/transpose-simple @@ -1,3 +1,9 @@ Can it be done more simple? 
zcat D.gz | perl -ne 's/\s+/\n/g; open(OUT,">","out".(++$out)); print OUT' ; paste out* | pigz >Dt.gz + +Chop CSV into fields + +multi file paste + +paste out1 out2 | paste - out3 diff --git a/transpose/transposewrap.pl b/transpose/transposewrap.pl index 7055424..34320a7 100755 --- a/transpose/transposewrap.pl +++ b/transpose/transposewrap.pl @@ -1,6 +1,5 @@ #!/usr/bin/perl - use File::Temp qw(tempfile tempdir); #$Global::debug = 1; @@ -8,7 +7,7 @@ my $block = "30m"; debug("parallel --pipe --block $block -k --files -j150% transpose-par.pl\n"); my @files = `parallel --pipe --block $block -k --files -j150% transpose-par.pl`; chomp(@files); -my $tmp = File::Temp::tempdir(CLEANUP => 0); +my $tmp = File::Temp::tempdir(CLEANUP => 1); my $fifo = "$tmp/0000000"; my $cmd = "mkfifo $fifo; paste > $fifo "; my (@fifos, @args);