Merge branch 'master' of gitlab.com:ole.tange/tangetools

This commit is contained in:
Ole Tange 2016-08-13 20:41:59 +02:00
commit ee13554589
8 changed files with 572 additions and 27 deletions

View file

@ -1,12 +1,12 @@
CMD = blink histogram upsidedown tracefile timestamp rand rrm goodpasswd gitnext
CMD = blink bsearch histogram upsidedown tracefile timestamp rand rrm goodpasswd gitnext
all: blink/blink.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1
all: blink/blink.1 bsearch/bsearch.1 goodpasswd/goodpasswd.1 histogram/histogram.1 rand/rand.1 rrm/rrm.1 timestamp/timestamp.1 tracefile/tracefile.1 upsidedown/upsidedown.1 wssh/wssh.1
%.1: %
pod2man $< > $@
install:
mkdir -p /usr/local/bin
parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm gitnext
parallel eval ln -sf `pwd`/*/{} /usr/local/bin/{} ::: blink bsearch reniced em field forever neno rn stdout tracefile w4it-for-port-open upsidedown histogram goodpasswd mtrr not summer timestamp transpose wssh aptsearch rand rrm gitnext
mkdir -p /usr/local/share/man/man1
parallel ln -sf `pwd`/{} /usr/local/share/man/man1/{/} ::: */*.1

2
README
View file

@ -2,6 +2,8 @@ Tools developed by Ole Tange <ole@tange.dk>.
Probably not useful for you, but then again you never know.
bsearch - binary search through sorted text files.
em - Force emacs to run in terminal. Use xemacs if installed.
field - Split on space. Give the given field number. Supports syntax 1-3,6-

400
bsearch/bsearch Executable file
View file

@ -0,0 +1,400 @@
#!/usr/bin/perl
=head1 NAME
bsearch - binary search through sorted text files
=head1 SYNOPSIS
B<bsearch> [-nrfB] file string [string...]
=head1 DESCRIPTION
B<bsearch> searches a sorted file for a string. It outputs the
following line or the byte position of this line, which is where the
string would have been if it had been in the sorted file.
=over 9
=item B<--ignore-leading-blanks> (not implemented)
=item B<-b>
ignore leading blanks
=item B<--byte-offset>
=item B<-B>
print byte position where string would have been
=item B<--dictionary-order> (not implemented)
=item B<-d>
consider only blanks and alphanumeric characters
=item B<--debug> (not implemented)
=item B<-D>
annotate the part of the line used to sort, and warn about
questionable usage to stderr
=item B<--ignore-case>
=item B<-f>
fold lower case to upper case characters
=item B<--general-numeric-sort> (not implemented)
=item B<-g>
compare according to general numerical value
=item B<--ignore-nonprinting> (not implemented)
=item B<-i>
consider only printable characters
=item B<--month-sort> (not implemented)
=item B<-M>
compare (unknown) < 'JAN' < ... < 'DEC'
=item B<--human-numeric-sort> (not implemented)
=item B<-h>
compare human readable numbers (e.g., 2K 1G)
=item B<--key=KEYDEF> (not implemented)
=item B<-k>
sort via a key; KEYDEF gives location and type
=item B<--numeric-sort>
=item B<-n>
compare according to string numerical value
=item B<--random-sort> (not implemented)
=item B<-R>
sort by random hash of keys
=item B<--reverse>
=item B<-r>
reverse the result of comparisons
=item B<--sort=WORD> (not implemented)
sort according to WORD: general-numeric B<-g>, human-numeric B<-h>, month
B<-M>, numeric B<-n>, random B<-R>, version B<-V>
=item B<-t> (not implemented)
=item B<--field-separator=SEP>
use SEP instead of non-blank to blank transition
=item B<-z> (not implemented)
=item B<--zero-terminated>
end lines with 0 byte, not newline
=back
=head1 EXAMPLES
=head2 Missing
Missing
=head1 REPORTING BUGS
B<bsearch> is part of tangetools. Report bugs to <tools@tange.dk>.
=head1 AUTHOR
Copyright (C) 2016 Ole Tange http://ole.tange.dk
=head1 LICENSE
Copyright (C) 2013 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head2 Documentation license I
Permission is granted to copy, distribute and/or modify this documentation
under the terms of the GNU Free Documentation License, Version 1.3 or
any later version published by the Free Software Foundation; with no
Invariant Sections, with no Front-Cover Texts, and with no Back-Cover
Texts. A copy of the license is included in the file fdl.txt.
=head2 Documentation license II
You are free:
=over 9
=item B<to Share>
to copy, distribute and transmit the work
=item B<to Remix>
to adapt the work
=back
Under the following conditions:
=over 9
=item B<Attribution>
You must attribute the work in the manner specified by the author or
licensor (but not in any way that suggests that they endorse you or
your use of the work).
=item B<Share Alike>
If you alter, transform, or build upon this work, you may distribute
the resulting work only under the same, similar or a compatible
license.
=back
With the understanding that:
=over 9
=item B<Waiver>
Any of the above conditions can be waived if you get permission from
the copyright holder.
=item B<Public Domain>
Where the work or any of its elements is in the public domain under
applicable law, that status is in no way affected by the license.
=item B<Other Rights>
In no way are any of the following rights affected by the license:
=over 9
=item *
Your fair dealing or fair use rights, or other applicable
copyright exceptions and limitations;
=item *
The author's moral rights;
=item *
Rights other persons may have either in the work itself or in
how the work is used, such as publicity or privacy rights.
=back
=item B<Notice>
For any reuse or distribution, you must make clear to others the
license terms of this work.
=back
A copy of the full license is included in the file as cc-by-sa.txt.
=head1 DEPENDENCIES
B<bsearch> uses Perl.
=head1 SEE ALSO
B<grep>(1), B<sort>(1).
=cut
use Getopt::Long;
# Main program: parse the options, then look up every remaining
# argument in the sorted file given as the first argument.

# Set before option parsing so error() can report the right program name.
$Global::progname = "bsearch";
$Global::version = 20160712;

Getopt::Long::Configure("bundling","require_order");
# GetOptions() returns false (after printing its own diagnostics) when
# the command line contains an invalid option: stop instead of searching
# with options the user did not intend.
GetOptions(
    "debug|D=s" => \$opt::D,
    "version" => \$opt::version,
    "verbose|v" => \$opt::verbose,
    "B|byte-offset" => \$opt::byte_offset,
    "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
    "d|dictionary-order" => \$opt::dictionary_order,
    "f|ignore-case" => \$opt::ignore_case,
    "g|general-numeric-sort" => \$opt::general_numeric_sort,
    "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
    "M|month-sort" => \$opt::month_sort,
    "h|human-numeric-sort" => \$opt::human_numeric_sort,
    "n|numeric-sort" => \$opt::numeric_sort,
    "r|reverse" => \$opt::reverse,
    "sort=s" => \$opt::sort,
    "V|version-sort" => \$opt::version_sort,
    "k|key=s" => \@opt::key,
    "t|field-separator=s" => \$opt::field_separator,
    "z|zero-terminated" => \$opt::zero_terminated,
    ) or exit 255;

if($opt::version) {
    version();
    exit 0;
}

# With -z records end with a NUL byte instead of a newline.
# This is deliberately global: every read in the program uses it.
if($opt::zero_terminated) { $/ = "\0"; }

# First non-option argument is the file; the rest are the keys.
my $file = shift;
if(not defined $file) {
    error("Usage: $Global::progname [-nrfB] file string [string...]");
    exit 255;
}

for my $key (@ARGV) {
    print bsearch($file,$key);
}
sub bsearch {
    # Binary search a sorted file for a key
    # Input:
    #   $file = name of the sorted file
    #   $key = string to search for
    # Uses:
    #   $opt::byte_offset
    #   the ordering options read by compare()
    # Returns:
    #   string to print:
    #     with $opt::byte_offset: a byte position followed by "\n"
    #     otherwise: the line read at the final position, or "" when
    #     that line does not sort before $key
    # Exits with status 1 if $file cannot be opened
    my $file = shift;
    my $key = shift;
    my $min = 0;
    # -s gives a non-numeric false value for an empty file: use 0
    my $max = (-s $file) || 0;
    # Lexical filehandle: nothing leaks into package variables
    my $fh;
    if(not open($fh, "<", $file)) {
        error("Cannot open '$file'");
        exit 1;
    }
    while($max - $min > 1) {
        my $middle = int(($max + $min)/2);
        seek($fh,$middle,0)
            or die "$Global::progname: Cannot seek to $middle in '$file': $!\n";
        # $middle is most likely inside a line: skip the rest of that line
        my $partial = <$fh>;
        # Narrow downwards if there is no full line after $middle or if
        # that line does not sort before $key; otherwise narrow upwards
        if(eof($fh)
           or
           compare(scalar(<$fh>),$key) >= 0) {
            $max = $middle;
        } else {
            $min = $middle;
        }
    }
    seek($fh,$max,0)
        or die "$Global::progname: Cannot seek to $max in '$file': $!\n";
    # NOTE(review): if $max is in the middle of a line, this reads only the
    # tail of that line, not a whole line - confirm that this is intended.
    my $line = <$fh>;
    my $result;
    if(compare($line,$key) >= 0) {
        # The very first line
        $result = $opt::byte_offset ? "0\n" : "";
    } else {
        # tell() is the position just after the line read above
        $result = $opt::byte_offset ? tell($fh)."\n" : $line;
    }
    close($fh);
    return $result;
}
sub compare {
    # Compare two strings according to the ordering options
    # Input:
    #   two strings (typically a line from the file and the search key)
    # Uses:
    #   $opt::reverse = swap the operands before comparing
    #   $opt::ignore_case = compare the upper cased strings
    #   $opt::numeric_sort = compare numerically
    #   $opt::numascii = compare numerically, break ties as strings
    # Returns:
    #   -1, 0, or 1 like <=> and cmp
    # The lexicals are deliberately not called $a/$b: those names belong
    # to sort().
    my ($left,$right) = @_;
    if($opt::reverse) {
        ($left,$right) = ($right,$left);
    }
    if($opt::ignore_case) {
        $left = uc($left);
        $right = uc($right);
    }
    if($opt::numeric_sort) {
        return $left <=> $right;
    } elsif($opt::numascii) {
        # '||' and not 'or': 'return X or Y' parses as '(return X) or Y',
        # so the string comparison would never be reached.
        return(($left <=> $right) || ($left cmp $right));
    } else {
        return $left cmp $right;
    }
}
sub status {
    # Print messages on the status filehandle, each followed by a newline
    # Input:
    #   list of messages
    # Uses:
    #   $Global::status_fd = filehandle to print on (default: STDERR)
    my $out = $Global::status_fd || *STDERR;
    my @chunks;
    for my $message (@_) {
        push @chunks, $message, "\n";
    }
    # A single print, so the whole list is written by one call
    print {$out} @chunks;
    $out->flush();
}
sub status_no_nl {
    # Print messages on the status filehandle exactly as given
    # (no newline is added)
    # Input:
    #   list of strings
    # Uses:
    #   $Global::status_fd = filehandle to print on (default: STDERR)
    my @strings = @_;
    my $out = $Global::status_fd || *STDERR;
    print {$out} @strings;
    $out->flush();
}
sub warning {
    # Print warnings on the status filehandle
    # Input:
    #   list of warning texts; each is printed on its own line,
    #   prefixed with the program name and ": Warning: "
    # Uses:
    #   $Global::progname (default: "parallel")
    my $prog = $Global::progname || "parallel";
    my @parts;
    for my $text (@_) {
        push @parts, $prog, ": Warning: ", $text, "\n";
    }
    status_no_nl(@parts);
}
sub error {
    # Print error messages on the status filehandle
    # Input:
    #   list of error texts; each is printed on its own line,
    #   prefixed with the program name and ": Error: "
    # Uses:
    #   $Global::progname (default: "parallel")
    my $prog = $Global::progname || "parallel";
    my @lines;
    for my $text (@_) {
        push @lines, "${prog}: Error: ${text}";
    }
    status(@lines);
}
sub die_bug {
    # Report an internal error ("this should not happen") and terminate
    # Input:
    #   $bugid = identifier of the failing place, included in the report
    # Uses:
    #   $Global::progname
    #   $Global::version
    # Returns: N/A (exits with status 255)
    my $bugid = shift;
    print STDERR
        ("$Global::progname: This should not happen. You have found a bug.\n",
         # Same address as REPORTING BUGS in the documentation
         "Please contact <tools\@tange.dk> and include:\n",
         "* The version number: $Global::version\n",
         "* The bugid: $bugid\n",
         "* The command line being run\n",
         "* The files being read (put the files on a webserver if they are big)\n",
         "\n",
         "If you get the error on smaller/fewer files, please include those instead.\n");
    # This program defines no wait_and_exit() and starts no children to
    # wait for: exit directly.
    exit(255);
}
sub version {
    # Print the version and license text on the currently selected
    # output filehandle
    # Uses:
    #   $Global::progname
    #   $Global::version
    # Returns: N/A
    my $name = $Global::progname;
    my @text = (
        "GNU $name $Global::version",
        "Copyright (C) 2016",
        "Ole Tange and Free Software Foundation, Inc.",
        "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
        "This is free software: you are free to change and redistribute it.",
        "GNU $name comes with no warranty.",
        "",
        "Web site: http://www.gnu.org/software/${name}\n",
        "When using programs that use GNU Parallel to process data for publication",
        "please cite as described in 'parallel --citation'.\n",
        );
    print join("\n", @text);
}

44
bsearch/regressiontest Executable file
View file

@ -0,0 +1,44 @@
#!/bin/bash
# Base name for the temporary files of the tests. mktemp replaces the
# deprecated Debian-only 'tempfile' command.
test_tmp=$(mktemp) || exit 1
export test_tmp
# mktemp creates the file itself: remove it when the script exits.
trap 'rm -f "$test_tmp"' EXIT
test_n() {
    # Exercise 'bsearch -n' on small numeric files.
    # For each file: show its content on one line (xargs), then search
    # for the same set of keys. The first (empty) file is not shown.
    # Uses:
    #   $test_tmp = base name for the temporary file
    # The expansions are quoted so a temp dir containing whitespace or
    # glob characters does not break the test.
    local tmp="${test_tmp}_n"
    local -a keys=(0 2 2.1 100000)

    # Empty file
    true > "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    # A single empty line
    echo > "$tmp"
    xargs < "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    # One line
    printf '%s\n' 1.000000000 > "$tmp"
    xargs < "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    # Two lines: the long number first, then last
    printf '%s\n' 1.000000000 2 > "$tmp"
    xargs < "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    printf '%s\n' 1 2.000000000 > "$tmp"
    xargs < "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    # Three lines: the long number first, in the middle, then last
    printf '%s\n' 1.000000000 2 3 > "$tmp"
    xargs < "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    printf '%s\n' 1 2.000000000 3 > "$tmp"
    xargs < "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    printf '%s\n' 1 2 3.000000000 > "$tmp"
    xargs < "$tmp"
    bsearch -n "$tmp" "${keys[@]}"

    rm "$tmp"
}
# Export every shell function whose name contains "test_" so that the
# shells started by GNU Parallel below can call them.
export -f $(compgen -A function | grep test_)
# Run the test functions, sorted by name, up to 6 at a time (-j6).
# -k keeps the output in input order, --tag prefixes every output line
# with the name of the test, and stderr is merged into stdout.
compgen -A function | grep test_ | sort | parallel -j6 --tag -k '{} 2>&1'

View file

@ -7,37 +7,43 @@ use File::Temp;
my $delimiter = shift;
my $buffer = shift;
$delimiter ||= ",";
# Use at most 1000M before flushing
$buffer ||= 1000_000_000;
$buffer ||= "1000M";
$buffer = multiply_binary_prefix($buffer);
# Perl makes the buffer balloon to 10 times the requested value
$buffer /= 10;
# max_col_size will be lowered after first line read.
$max_col_size = $buffer;
my $delimiter_regexp = $delimiter;
$delimiter_regexp =~ s/(\W)/\\$1/g;
my @current;
my $last_t = 0;
my $col_no_last_line = 0;
my $lineno = 0;
my %col;
while(<>) {
chomp;
# Split current line into columns
@current = split /$delimiter_regexp/o, $_;
my $t = 0;
my $col_no = 0;
my @to_be_flushed = ();
map {
push(@{$col{$t}},$_);
$col_size{$t} += length $_;
if($col_size{$t} > $max_col_size) {
flush(\%col,$t);
$col_size{$t} = 0;
push(@{$col{$col_no}},$_);
$col_size{$col_no} += length $_;
if($col_size{$col_no} > $max_col_size) {
push @to_be_flushed, $col_no;
$col_size{$col_no} = 0;
}
$t++;
} @current;
if($t != $last_t) {
if(0 == $last_t) {
$last_t = $t;
$max_col_size = $buffer/$last_t;
$col_no++;
} split /$delimiter_regexp/o, $_; # This should do de-csv'ing
if(@to_be_flushed) {
flush(\%col,@to_be_flushed);
}
if($col_no != $col_no_last_line) {
if(0 == $col_no_last_line) {
# This is first time around
$col_no_last_line = $col_no;
$max_col_size = $buffer/$col_no_last_line;
} else {
warning("Number of columns in line $NR: $t != $last_t\n");
warning("Number of columns in line $NR: $col_no != $col_no_last_line\n");
}
}
}
@ -86,3 +92,34 @@ sub error {
my @w = @_;
print STDERR "transpose: Error: ", @w;
}
sub multiply_binary_prefix {
    # Evaluate a number with unit suffixes
    #   k=10^3, m=10^6, g=10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24, x=10^27
    #   K=2^10, M=2^20, G=2^30, T=2^40, P=2^50, E=2^60, Z=2^70, Y=2^80, X=2^90
    #   Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^60, Zi=2^70, Yi=2^80, Xi=2^90
    #   ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^60, zi=2^70, yi=2^80, xi=2^90
    # 13G = 13*1024*1024*1024 = 13958643712
    # Input:
    #   $s = string with suffixes (e.g. "1000M")
    # Returns:
    #   the value as a number (undef if the result cannot be evaluated)
    my $s = shift;
    # Number of factors each letter stands for
    my %power_of;
    @power_of{qw(k m g t p e z y x)} = (1..9);
    # All suffixes are replaced in a single pass, so one rule cannot
    # rewrite text that belongs to another (e.g. the "M" of "Mi").
    $s =~ s{([kmgtpezyx])(i?)}{
        my ($letter, $binary) = ($1, $2);
        # A trailing "i" or an upper case letter means powers of 1024,
        # a bare lower case letter means powers of 1000
        my $base = ($binary ne "" or $letter eq uc($letter)) ? 1024 : 1000;
        "*" . join("*", ($base) x $power_of{lc($letter)});
    }gie;
    # NOTE(review): string eval of a value taken from the command line.
    # Acceptable for the user's own argument; never feed this untrusted input.
    $s = eval $s;
    return $s;
}

View file

@ -1,7 +1,5 @@
#!/usr/bin/perl
#!/usr/local/bin/parallel --shebang-wrap --pipe --block 10m -k --files /usr/bin/perl | xargs paste
use Text::CSV;
use File::Temp qw(tempfile tempdir);
@ -32,6 +30,65 @@ while(my $l = <>) {
print map { join("\t",@$_),"\n" } @table;
sub guess_csv_setting {
# Guess the CSV settings for the input.
# Returns:
#   hash ref of settings (keys used here: binary, sep_char)
# Based on a single line guess the csv_setting
# NOTE(review): this return is unconditional, so as the code reads here
# nothing below it is executed - presumably guessing is disabled on
# purpose; confirm.
return { binary => 1 };
# Based on two lines guess the csv_setting
my $line = shift;
# Potential field separators
# Priority:
# \0 if both lines have the same number
# \t if both lines have the same number
my @fieldsep = (",", "\t", "\0", ":", ";", "|", "/");
my %count;
# NOTE(review): six zeros for seven separators; the count for "/"
# starts out undefined.
@count{@fieldsep} = (0,0,0,0,0,0);
# Count characters
map { $count{$_}++ } split //,$line;
# Separators ordered by how often they occur, most frequent first
my @sepsort = sort { $count{$b} <=> $count{$a} } @fieldsep;
my $guessed_sep;
if($count{"\0"} > 0) {
# \0 is in the line => this is definitely the field sep
$guessed_sep = "\0";
} elsif($count{"\t"} > 0) {
# \t is in the line => this is definitely the field sep
$guessed_sep = "\t";
} else {
# Otherwise use the candidate that occurs most often
$guessed_sep = $sepsort[0];
}
return { binary => 1, sep_char => $guessed_sep };
}
sub _guess_csv_setting {
# Try different csv_settings
# Return a $csv object with the best setting
# NOTE(review): @lines is not declared in this sub - presumably a
# file-level variable holding sample lines; confirm.
# Candidate settings, tried in this order
my @csv_file_types =
( { binary => 1, sep_char => "\0" },
{ binary => 1, sep_char => "\t" },
{ binary => 1, sep_char => "," },
{ binary => 1 },
);
my $succesful_csv_type;
my $csv;
for my $csv_file_type (@csv_file_types) {
$csv = Text::CSV->new ( $csv_file_type )
or die "Cannot use CSV: ($csv_file_type) ".Text::CSV->error_diag ();
# Assume this candidate works until a line fails to parse
# NOTE(review): the outer loop never stops early, so this value is
# overwritten for every candidate and only the outcome of the last
# candidate survives the loop - confirm that this is intended.
$succesful_csv_type = $csv_file_type;
my $last_n_fields;
for my $line (@lines) {
if($csv->parse($line)) {
my $n_fields = ($csv->fields());
# NOTE(review): assigns $last_fields, but the variable declared above
# is $last_n_fields; neither is read afterwards.
$last_fields ||= $n_fields;
} else{
# A line could not be parsed with this candidate: reject it
$succesful_csv_type = 0;
last;
}
}
}
if(not $succesful_csv_type) {
$csv->error_diag();
}
$csv = Text::CSV->new ( $succesful_csv_type ) # should set binary attribute.
or die "Cannot use CSV: ".Text::CSV->error_diag ();
return($csv);
}

View file

@ -1,3 +1,9 @@
Can it be done more simply?
zcat D.gz | perl -ne 's/\s+/\n/g; open(OUT,">","out".(++$out)); print OUT' ; paste out* | pigz >Dt.gz
Chop CSV into fields
multi file paste
paste out1 out2 | paste - out3

View file

@ -1,6 +1,5 @@
#!/usr/bin/perl
use File::Temp qw(tempfile tempdir);
#$Global::debug = 1;
@ -8,7 +7,7 @@ my $block = "30m";
debug("parallel --pipe --block $block -k --files -j150% transpose-par.pl\n");
my @files = `parallel --pipe --block $block -k --files -j150% transpose-par.pl`;
chomp(@files);
my $tmp = File::Temp::tempdir(CLEANUP => 0);
my $tmp = File::Temp::tempdir(CLEANUP => 1);
my $fifo = "$tmp/0000000";
my $cmd = "mkfifo $fifo; paste > $fifo ";
my (@fifos, @args);