release procedure updated.

parallel: --plus --onall now works.
parallel: --blocktimeout must be >= 1.
This commit is contained in:
Ole Tange 2020-05-31 16:42:04 +02:00
parent 1e7da025c4
commit b5b3d5dc3e
14 changed files with 472 additions and 126 deletions

View file

@ -123,4 +123,25 @@ testurls:
mkdir -p urls
cd urls && grep -v '(dead)' ../src/* | grep -h -Po 'https?://[^ $$<>")}]+' | perl -pe 's/(&gt|\{).*//;s/\\-/-/g;s/\\n//g;s/&amp;/&/g;s/&#.*//;'"s/'.*//" | grep -Ev 'parallel-(20)?$$|coolwebsite.biz' | sort -u | egrep -v 'example.com|##|\*\(' | parallel -j0 --timeout 33 --bar --tag --joblog joblog --retries 3 neno wget -m -l1 -Q1 '{=$$_=Q($$_)=}'
# reconf: regenerate the autotools build system from scratch.
# Removes the generated autoconf/automake files, re-runs autoreconf,
# then configures, builds and installs (the install step uses sudo).
reconf:
rm -fr autom4te.cache aclocal.m4 config.h config.h.in config.log Makefile.in missing install-sh
rm -rf src/Makefile.in
autoreconf --install -W gnu
./configure
make -j
sudo make install
# pack_unpack_and_test_build: build the distribution tarballs and check
# that the .tar.bz2 can be unpacked, configured, built and installed
# from a clean directory under /tmp.
# The tarball name is derived from $(YYYYMMDD), which must be set when
# this target is run.
pack_unpack_and_test_build:
echo '### Building tar.bz2'
./configure
make dist
make dist-bzip2
echo "### Unpack parallel-$(YYYYMMDD).tar.bz2"
cp parallel-$(YYYYMMDD).tar.bz2 /tmp
cd /tmp && \
tar xjf parallel-$(YYYYMMDD).tar.bz2 && \
cd parallel-$(YYYYMMDD) && \
./configure && make -j && sudo make -j install
EXTRA_DIST = CITATION CREDITS cc-by-sa.txt fdl.txt

View file

@ -871,6 +871,26 @@ testurls:
mkdir -p urls
cd urls && grep -v '(dead)' ../src/* | grep -h -Po 'https?://[^ $$<>")}]+' | perl -pe 's/(&gt|\{).*//;s/\\-/-/g;s/\\n//g;s/&amp;/&/g;s/&#.*//;'"s/'.*//" | grep -Ev 'parallel-(20)?$$|coolwebsite.biz' | sort -u | egrep -v 'example.com|##|\*\(' | parallel -j0 --timeout 33 --bar --tag --joblog joblog --retries 3 neno wget -m -l1 -Q1 '{=$$_=Q($$_)=}'
# reconf: regenerate the autotools build system from scratch.
# Removes the generated autoconf/automake files, re-runs autoreconf,
# then configures, builds and installs (the install step uses sudo).
reconf:
rm -fr autom4te.cache aclocal.m4 config.h config.h.in config.log Makefile.in missing install-sh
rm -rf src/Makefile.in
autoreconf --install -W gnu
./configure
make -j
sudo make install
# pack_unpack_and_test_build: build the distribution tarballs and check
# that the .tar.bz2 can be unpacked, configured, built and installed
# from a clean directory under /tmp.
# The tarball name is derived from $(YYYYMMDD), which must be set when
# this target is run.
pack_unpack_and_test_build:
echo '### Building tar.bz2'
./configure
make dist
make dist-bzip2
echo "### Unpack parallel-$(YYYYMMDD).tar.bz2"
cp parallel-$(YYYYMMDD).tar.bz2 /tmp
cd /tmp && \
tar xjf parallel-$(YYYYMMDD).tar.bz2 && \
cd parallel-$(YYYYMMDD) && \
./configure && make -j && sudo make -j install
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

View file

@ -1,5 +1,9 @@
Quote of the month:
GNU parallel, which works a little bit like xargs, but has a much more friendly way of handling files with spaces and automatically parallelises calls. This tool has saved me a great deal of coding because it makes it so easy to write a program which does just one part of a task and then run it in parallel with load balancing and a nice progress bar. I cannot recommend this tool enough.
https://negfeedback.blogspot.com/2020/05/indispensable-command-line-tools.html
Who needs spark when GNU Parallel exists
-- MatthijsB @MatthijsBrs@twitter
@ -57,6 +61,12 @@ Quote of the month:
=== Used ===
GNU Parallel: dead simple process-level parallelization of ad hoc
tasks. Write for a chunk, let gnu manage the splitting, permutations
and pool concurrency.
-- Nick Ursa @nickursa@twitter
I wish more command line software had example pages as robust as GNU Parallel
-- Lucidbeaming @lucidbeaming

View file

@ -10,116 +10,66 @@ Unmodified beta since last version => production
Unmodified alpha since last version => beta
Modified => alpha
== Update version ==
== Update NEWS ==
Get DOI:
https://zenodo.org/deposit/new (Reserve DOI)
configure.ac: AC_INIT([parallel], [20100422], [bug-parallel@gnu.org])
src/parallel: $Global::version = 20100422;
README: parallel-20130222
DOINO=3840974
TAG=Kraftwerk
YYYYMMDD=$(echo `yyyymmdd`-1 | bc)
YYYYMMDD=$(echo `yyyymmdd`+1 | bc)
YYYYMMDD=`yyyymmdd`
updater() {
export DOINO
export TAG
export YYYYMMDD
export DOI=10.5281/zenodo.$DOINO
export YYYY=${YYYYMMDD:0:4}
export MON=`date +%b`
export MONTH=`date +%B`
echo Tag=$TAG Date:$YYYYMMDD Year:$YYYY Mon:$MON Month:$MONTH DOI:$DOI
export TITLE="GNU Parallel $YYYYMMDD ('$TAG')"
perl -i -pe "s/20\d\d\d\d\d\d/$YYYYMMDD/" configure.ac
perl -i -pe "/version/ and s/20\d\d\d\d\d\d/$YYYYMMDD/" src/sql
perl -i -pe "/version/ and s/20\d\d\d\d\d\d/$YYYYMMDD/" src/niceload
perl -i -pe "s/parallel-20\d\d\d\d\d\d/parallel-$YYYYMMDD/" README
perl -i -pe '
# Update version 20209999
/version/ and s/20\d\d\d\d\d\d/$ENV{YYYYMMDD}/;
# Update: 10.5281/zenodo.1146014
s:10.5281/zenodo.\d+:$ENV{DOI}:;
# Update "@software{tange_2015_16303,"
s:tange_\d+_\d+:tange_$ENV{YYYY}_$ENV{DOINO}:;
# Update month = mar,
s/(month\s+=\s+)\S+,",/$1$ENV{MON},",/;
# Update title = {GNU Parallel 20200522 ('Kraftwerk')},",
/ title\s+= / and s/\{.*\}/{$ENV{TITLE}}/;
# Tange, O. (2020, May 22). GNU Parallel 20200522 ('Kraftwerk').
s/(Tange, O. .).*(.. )(GNU.*[)])/$1$ENV{YYYY}, $ENV{MONTH} 22$2$ENV{TITLE}/;
' src/parallel README
(
ppar --help
ppar --citation
grep -i 'zenodo|tange' README
) 2>&1 | grep -E '^ |^}|tange'
mv ~/.parallel/will-cite ~/.parallel/will-cite.
ppar ::: true
mv ~/.parallel/will-cite. ~/.parallel/will-cite
}
updater
=== Autoconf/automake ===
rm -fr autom4te.cache aclocal.m4 config.h config.h.in config.log Makefile.in missing install-sh
rm -rf src/Makefile.in
autoreconf --install -W gnu
./configure
make -j && sudo make install
With the same things that goes in the announce mail
== Testsuite ==
cd testsuite; make mem; make
== Update NEWS ==
== Update version ==
With the same things that goes in the announce mail
https://zenodo.org/deposit/new
== Package ==
(*) Software
(Reserve DOI)
https://orcid.org/0000-0002-6345-1437
Description
GNU Parallel is a general parallelizer to run multiple serial command line programs in parallel without changing them.
./configure
make dist
make dist-bzip2
License:
gpl v3
== Test the package ==
[Save]
YYYYMMDD=`yyyymmdd`
cp parallel-$YYYYMMDD.tar.bz2 /tmp
pushd /tmp
tar xjvf parallel-$YYYYMMDD.tar.bz2
cd parallel-$YYYYMMDD
./configure && make -j && sudo make -j install
pushd
DOINO=3841377
TAG=Kraftwerk
# Update version
. packager/releasescripts/updateversion
=== Reconfig autoconf/automake, build tar.bz2 and test it ===
make reconf &&
make pack_unpack_and_test_build
== Upload ==
YYYYMMDD=`yyyymmdd`
export YYYYMMDD
export YYYYMMDD=`yyyymmdd`
export YYYYMMDD=${YYYYMMDD:0:6}22
eval `gpg-agent --daemon`
# Takes up to 8 minutes
make upload
# Only needed for alpha:
YYYYMMDD=`yyyymmdd`
export YYYYMMDD
export YYYYMMDD=`yyyymmdd`
export YYYYMMDD=${YYYYMMDD:0:6}22
eval `gpg-agent --daemon`
# Takes up to 8 minutes
make alphaupload
== Update OpenSUSE build system ==
export YYYYMMDD=`yyyymmdd`
export YYYYMMDD=${YYYYMMDD:0:6}22
YYYYMMDD=`yyyymmdd`
export YYYYMMDD
cd ~/privat/parallel/packager/obs
find home:tange/parallel/* -type f | grep -v parallel.spec | parallel -Xj1 osc rm {}
find home:tange/parallel/* -type f |
grep -v parallel.spec |
parallel -j1 'osc rm {} || rm {}'
# This should not create new files
osc up home:tange/parallel/
make
@ -127,23 +77,6 @@ make
https://build.opensuse.org/package/show/home:tange/parallel
# Check that one .deb (Debian 5.0) and one .rpm (CentOS) end with 'succeeded'
== Download and test ==
# Only needed for alpha (part of 'make upload')
YYYYMMDD=`yyyymmdd`
pushd /tmp
rm -rf parallel-${YYYYMMDD}*
# This can take 7 minutes
#while ! wget http://ftp.gnu.org/gnu/parallel/parallel-$YYYYMMDD.tar.bz2 ; do sleep 2; done
while ! wget http://alpha.gnu.org/gnu/parallel/parallel-$YYYYMMDD.tar.bz2 ; do sleep 2; done
tar xjvf parallel-$YYYYMMDD.tar.bz2
cd parallel-$YYYYMMDD
./configure
make -j && sudo make -j install
pushd
sudo cp /usr/local/bin/parallel /usr/local/bin/parallel-$YYYYMMDD
== Update website ==
http://www.gnu.org/software/parallel/
@ -172,7 +105,8 @@ git diff
# Recheck OBS https://build.opensuse.org/package/show/home:tange/parallel
YYYYMMDD=`yyyymmdd`
export YYYYMMDD=`yyyymmdd`
export YYYYMMDD=${YYYYMMDD:0:6}22
TAG=MyTag
echo "Released as $YYYYMMDD ('$TAG')." | grep MyTag && (STOP;STOP;STOP)
echo "$TAG" | grep ' ' && (STOP;STOP;STOP)
@ -186,12 +120,16 @@ torsocks git push
torsocks git push origin $TAG
torsocks git push origin $YYYYMMDD
== Zenodo ==
Add tar.bz2 and publish.
== Update documentation ==
Update version number + 1
YYYYMMDD=$(echo `yyyymmdd`+1 | bc)
export YYYYMMDD=`yyyymmdd`
export YYYYMMDD=${YYYYMMDD:0:6}23
echo $YYYYMMDD
perl -i -pe "/version/ and s/20\d\d\d\d\d\d/$YYYYMMDD/" src/parallel
perl -i -pe "/version/ and s/20\d\d\d\d\d\d/$YYYYMMDD/" src/sql
@ -250,9 +188,9 @@ from:tange@gnu.org
to:parallel@gnu.org, bug-parallel@gnu.org
stable-bcc: Jesse Alama <jessealama@fastmail.fm>
Subject: GNU Parallel 20200522 ('Kraftwerk') released <<[stable]>>
Subject: GNU Parallel 20200622 ('SpaceX') released <<[stable]>>
GNU Parallel 20200522 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
GNU Parallel 20200622 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
<<No new functionality was introduced so this is a good candidate for a stable release.>>
@ -262,23 +200,14 @@ Quote of the month:
New in this release:
* While running a job $PARALLEL_JOBSLOT is the jobslot of the job. It is equal to {%} unless the job is being retried. See {%} for details.
* While running a job $PARALLEL_SSHLOGIN is the sshlogin line with number of cores removed. E.g. '4//usr/bin/specialssh user@host' becomes: '/usr/bin/specialssh user@host'
* While running a job $PARALLEL_SSHHOST is the host part of an sshlogin line. E.g. '4//usr/bin/specialssh user@host' becomes: 'host'
* --plus activates the replacement strings {slot} = $PARALLEL_JOBSLOT, {sshlogin} = $PARALLEL_SSHLOGIN, {host} = $PARALLEL_SSHHOST
*
* Bug fixes and man page updates.
News about GNU Parallel:
* Portable Batch System (PBS) & GNU Parallel - Running a Program Multiple Times in Parallel https://www.youtube.com/watch?v=6ccbWu6Befo
* GNU Parallel przykład https://www.youtube.com/watch?v=gs_wG4Kt2G4
* demo of LINUX APP - GNU PARALLEL - running multiple Gstreamer webcam .sh scripts with only 1 command https://www.youtube.com/watch?v=trQuA_wmWjg
https://negfeedback.blogspot.com/2020/05/indispensable-command-line-tools.html
Get the book: GNU Parallel 2018 http://www.lulu.com/shop/ole-tange/gnu-parallel-2018/paperback/product-23558902.html

View file

@ -1,6 +1,6 @@
<directory name="parallel" rev="255" srcmd5="0350108161455748495042e7633c6418" vrev="1">
<entry md5="4072b609d162de95219a3fb0df78bf1d" mtime="1584821918" name="parallel-20200322.tar.bz2" size="2060154" />
<entry md5="8a15e171a8a11c43f468c5099db2127f" mtime="1584821918" name="parallel.spec" size="4701" />
<entry md5="beee7117ae920f6dd83ab99216b78a1c" mtime="1584821918" name="parallel_20200322.dsc" size="556" />
<entry md5="f212a11e0cc6722f750f16814a072022" mtime="1584821919" name="parallel_20200322.tar.gz" size="2243420" />
<directory name="parallel" rev="262" srcmd5="046ea7264891de22a876dd572d1b1052" vrev="3">
<entry md5="75065cd90f9f178fdc23362ae1e7c4ec" mtime="1590269378" name="parallel-20200522.tar.bz2" size="2107406" />
<entry md5="1e1fc7e397d31455eb6e353abdcfa2f1" mtime="1590269118" name="parallel.spec" size="4876" />
<entry md5="985bad0a4ff52a3683f2b68e0e51ce5f" mtime="1590269119" name="parallel_20200522.dsc" size="556" />
<entry md5="5ae40f2bfdc5f44d3afd7480e8cd13ff" mtime="1590269121" name="parallel_20200522.tar.gz" size="2289865" />
</directory>

View file

@ -13,9 +13,7 @@ You can find more about GNU Parallel at: http://www.gnu.org/s/parallel/&#13;
&#13;
Watch the intro video on http://www.youtube.com/playlist?list=PL284C9FF2488BC6D1 or walk through the tutorial http://www.gnu.org/software/parallel/parallel_tutorial.html&#13;
&#13;
When using GNU Parallel for a publication please cite:&#13;
&#13;
O. Tange (2011): GNU Parallel - The Command-Line Power Tool, ;login: The USENIX Magazine, February 2011:42-47.&#13;
When using GNU Parallel for a publication please cite as per: 'parallel --citation'&#13;
&#13;
&#13;
= About GNU SQL =&#13;

View file

@ -1,7 +1,7 @@
Summary: Shell tool for executing jobs in parallel
Name: parallel
Version: 20200322
Version: 20200522
Release: 1.3
License: GPL-3.0-or-later
Group: Productivity/File utilities
@ -66,6 +66,7 @@ rm $RPM_BUILD_ROOT%{_docdir}/sem.html
rm $RPM_BUILD_ROOT%{_docdir}/sql.html
rm $RPM_BUILD_ROOT%{_docdir}/parcat.html
rm $RPM_BUILD_ROOT%{_docdir}/parset.html
rm $RPM_BUILD_ROOT%{_docdir}/parsort.html
rm $RPM_BUILD_ROOT%{_docdir}/parallel.texi
rm $RPM_BUILD_ROOT%{_docdir}/env_parallel.texi
rm $RPM_BUILD_ROOT%{_docdir}/parallel_tutorial.texi
@ -77,6 +78,7 @@ rm $RPM_BUILD_ROOT%{_docdir}/sem.texi
rm $RPM_BUILD_ROOT%{_docdir}/sql.texi
rm $RPM_BUILD_ROOT%{_docdir}/parcat.texi
rm $RPM_BUILD_ROOT%{_docdir}/parset.texi
rm $RPM_BUILD_ROOT%{_docdir}/parsort.texi
rm $RPM_BUILD_ROOT%{_docdir}/parallel.pdf
rm $RPM_BUILD_ROOT%{_docdir}/env_parallel.pdf
rm $RPM_BUILD_ROOT%{_docdir}/parallel_tutorial.pdf
@ -88,6 +90,7 @@ rm $RPM_BUILD_ROOT%{_docdir}/sem.pdf
rm $RPM_BUILD_ROOT%{_docdir}/sql.pdf
rm $RPM_BUILD_ROOT%{_docdir}/parcat.pdf
rm $RPM_BUILD_ROOT%{_docdir}/parset.pdf
rm $RPM_BUILD_ROOT%{_docdir}/parsort.pdf
rm $RPM_BUILD_ROOT%{_docdir}/parallel_cheat_bw.pdf
%clean
@ -98,7 +101,7 @@ rm -rf $RPM_BUILD_ROOT
/usr/bin/*
/usr/share/man/man1/*
/usr/share/man/man7/*
%doc README NEWS src/parallel.html src/env_parallel.html src/parallel_tutorial.html src/parallel_design.html src/parallel_alternatives.html src/parallel_book.html src/sem.html src/sql.html src/parcat.html src/parset.html src/niceload.html src/parallel.texi src/env_parallel.texi src/parallel_tutorial.texi src/parallel_design.texi src/parallel_alternatives.texi src/parallel_book.texi src/niceload.texi src/sem.texi src/sql.texi src/parcat.texi src/parset.texi src/parallel.pdf src/env_parallel.pdf src/parallel_tutorial.pdf src/parallel_design.pdf src/parallel_alternatives.pdf src/parallel_book.pdf src/niceload.pdf src/sem.pdf src/sql.pdf src/parcat.pdf src/parset.pdf src/parallel_cheat_bw.pdf
%doc README NEWS src/parallel.html src/env_parallel.html src/parallel_tutorial.html src/parallel_design.html src/parallel_alternatives.html src/parallel_book.html src/sem.html src/sql.html src/parcat.html src/parset.html src/parsort.html src/niceload.html src/parallel.texi src/env_parallel.texi src/parallel_tutorial.texi src/parallel_design.texi src/parallel_alternatives.texi src/parallel_book.texi src/niceload.texi src/sem.texi src/sql.texi src/parcat.texi src/parset.texi src/parsort.texi src/parallel.pdf src/env_parallel.pdf src/parallel_tutorial.pdf src/parallel_design.pdf src/parallel_alternatives.pdf src/parallel_book.pdf src/niceload.pdf src/sem.pdf src/sql.pdf src/parcat.pdf src/parset.pdf src/parsort.pdf src/parallel_cheat_bw.pdf
%changelog
* Sat Jan 22 2011 Ole Tange

View file

@ -0,0 +1,58 @@
#!/bin/bash
# updater: stamp the release version, DOI and citation data into the
# source tree.
#
# Reads (from the calling shell):
#   DOINO  numeric part of the reserved Zenodo DOI (required)
#   TAG    name of the release (required)
# Derives and exports YYYYMMDD (current year+month, day fixed to 22),
# YYYY, YYYYMM, MON, MONTH, DOI and TITLE.
# Rewrites configure.ac, src/sql, src/niceload, src/parallel and README
# in place, and remembers the last used values in .last-doi.txt and
# .last-tag.txt.
# Returns non-zero without touching any file if DOINO or TAG is unset.
updater() {
    export DOINO
    export TAG
    export YYYYMMDD=`yyyymmdd`
    export DOI=10.5281/zenodo.$DOINO
    export YYYY=${YYYYMMDD:0:4}
    export YYYYMM=${YYYYMMDD:0:6}
    # Releases are always dated the 22nd of the month
    export YYYYMMDD=${YYYYMM}22
    export MON=`date +%b`
    export MONTH=`date +%B`
    echo Tag=$TAG Date:$YYYYMMDD Year:$YYYY Mon:$MON Month:$MONTH DOI:$DOI
    export TITLE="GNU Parallel $YYYYMMDD ('$TAG')"
    if [ -z "$DOINO" ] ; then
        echo '*** Set DOINO and try again ***'
        echo 'https://zenodo.org/deposit/new (Reserve DOI)'
        echo '[Save]'
        # Show the previous value as a hint
        echo DOINO=$(cat .last-doi.txt)
        # Signal the failure to the caller (this file is sourced)
        return 1
    fi
    echo "$DOINO" > .last-doi.txt
    if [ -z "$TAG" ] ; then
        echo '*** Set TAG and try again ***'
        echo TAG=$(cat .last-tag.txt)
        return 1
    fi
    echo "$TAG" > .last-tag.txt
    perl -i -pe "s/20\d\d\d\d\d\d/$YYYYMMDD/" configure.ac
    perl -i -pe "/version/ and s/20\d\d\d\d\d\d/$YYYYMMDD/" src/sql
    perl -i -pe "/version/ and s/20\d\d\d\d\d\d/$YYYYMMDD/" src/niceload
    perl -i -pe "s/parallel-20\d\d\d\d\d\d/parallel-$YYYYMMDD/" README
    # The perl program below is single-quoted for the shell, so it must
    # not contain apostrophes (not even in its comments).
    perl -i -pe '
        # Update version 20209999
        /version/ and s/20\d\d\d\d\d\d/$ENV{YYYYMMDD}/;
        # Update: 10.5281/zenodo.1146014
        s:10.5281/zenodo.\d+:$ENV{DOI}:;
        # Update "@software{tange_2015_16303,"
        s:tange_\d+_\d+:tange_$ENV{YYYY}_$ENV{DOINO}:;
        # Update month = mar,
        s/(month\s+=\s+)\S+,",/$1$ENV{MON},",/;
        # Update title = {GNU Parallel 20200522 (Kraftwerk)},",
        / title\s+= / and s/\{.*\}/{$ENV{TITLE}}/;
        # Tange, O. (2020, May 22). GNU Parallel 20200522 (Kraftwerk).
        s/(Tange, O. .).*(.. )(GNU.*[)])/$1$ENV{YYYY}, $ENV{MONTH} 22$2$ENV{TITLE}/;
    ' src/parallel README
    # Show the updated version/citation lines for a visual check
    (
        ppar --help
        ppar --citation
        # -E is needed: in a basic regex "|" is a literal character
        grep -Ei 'zenodo|tange' README
    ) 2>&1 | grep -E '^ |^}|tange'
    # Run once with will-cite moved out of the way, then restore it
    # (presumably to check what a user who has not run --citation sees)
    mv ~/.parallel/will-cite ~/.parallel/will-cite.
    ppar ::: true
    mv ~/.parallel/will-cite. ~/.parallel/will-cite
}
updater

View file

@ -948,7 +948,7 @@ sub spreadstdin() {
my $two_gb = 2**31-1;
my $blocksize = $Global::blocksize;
my $in = *STDIN;
my $timeout = ::multiply_time_units($opt::blocktimeout);
my $timeout = $Global::blocktimeout;
my $header = find_header(\$buf,$in);
my $anything_written;
@ -1762,6 +1762,13 @@ sub parse_options(@) {
if(defined $opt::max_args) {
$Global::max_number_of_args = $opt::max_args;
}
if(defined $opt::blocktimeout) {
$Global::blocktimeout = int(multiply_time_units($opt::blocktimeout));
if($Global::blocktimeout < 1) {
::error("--block-timeout must be at least 1");
wait_and_exit(255);
}
}
if(defined $opt::timeout) {
$Global::timeoutq = TimeoutQueue->new($opt::timeout);
}
@ -4535,6 +4542,7 @@ sub onall($@) {
((defined $opt::linebuffer) ? "--linebuffer" : ""),
((defined $opt::max_chars) ? "--max-chars ".$opt::max_chars : ""),
((defined $opt::plain) ? "--plain" : ""),
((defined $opt::plus) ? "--plus" : ""),
((defined $opt::retries) ? "--retries ".$opt::retries : ""),
((defined $opt::timeout) ? "--timeout ".$opt::timeout : ""),
((defined $opt::ungroup) ? "-u" : ""),

View file

@ -2910,7 +2910,6 @@ most likely do what is needed.
=back
=head1 EXAMPLE: Working as xargs -n1. Argument appending
GNU B<parallel> can work similar to B<xargs -n1>.
@ -3144,6 +3143,17 @@ If B<my_program> fails a red FAIL will be printed followed by the failing
command; otherwise a green OK will be printed followed by the command.
=head1 EXAMPLE: Continuously show the latest line of output
It can be useful to monitor the output of running jobs.
This shows the most recent output line until a job finishes. After
which the output of the job is printed in full:
parallel '{} | tee >(cat >&3)' ::: 'command 1' 'command 2' \
3> >(perl -ne '$|=1;chomp;printf"%.'$COLUMNS's\r",$_." "x100')
=head1 EXAMPLE: Log rotate
Log rotation renames a logfile to an extension with a higher number:

View file

@ -2616,6 +2616,8 @@ https://pypi.org/project/papply/ (Last checked: 2020-04)
=head2 Todo
https://gitlab.com/netikras/bthread
https://github.com/JeiKeiLim/simple_distribute_job
https://github.com/reggi/pkgrun

278
src/parsort Executable file
View file

@ -0,0 +1,278 @@
#!/usr/bin/perl
=pod
=head1 NAME
parsort - Sort (big files) in parallel
=head1 SYNOPSIS
B<parsort> I<options for sort>
=head1 DESCRIPTION
B<parsort> uses GNU B<sort> to sort in parallel. It works just like
B<sort> but faster on inputs with more than 1 M lines, if you have a
multicore machine.
Hopefully these ideas will make it into GNU Sort in the future.
=head1 EXAMPLE
Sort files:
parsort *.txt > sorted.txt
Sort stdin (standard input) numerically:
cat numbers | parsort -n > sorted.txt
=head1 PERFORMANCE
B<parsort> is faster on files, because these can be read in parallel.
On a 48 core machine you should see a speedup of 3x over B<sort>.
=head1 AUTHOR
Copyright (C) 2020 Ole Tange,
http://ole.tange.dk and Free Software Foundation, Inc.
=head1 LICENSE
Copyright (C) 2012 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head1 DEPENDENCIES
B<parsort> uses B<sort>, B<bash>, B<parallel>, and B<mbuffer>.
=head1 SEE ALSO
B<sort>
=cut
use strict;
use Getopt::Long;
use POSIX qw(mkfifo);
# Parse the command line.
#
# "bundling" allows single letter options to be combined (-nr).
# "require_order" stops the parsing at the first non-option, so what
# is left in @ARGV afterwards are the files to sort.
Getopt::Long::Configure("bundling","require_order");
# Keep the untouched command line: the options consumed by GetOptions
# are later passed on verbatim to sort.
my @ARGV_before = @ARGV;
GetOptions(
    "debug|D" => \$opt::D,
    "version" => \$opt::version,
    "verbose|v" => \$opt::verbose,
    "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
    "d|dictionary-order" => \$opt::dictionary_order,
    "f|ignore-case" => \$opt::ignore_case,
    "g|general-numeric-sort" => \$opt::general_numeric_sort,
    "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
    "M|month-sort" => \$opt::month_sort,
    "h|human-numeric-sort" => \$opt::human_numeric_sort,
    "n|numeric-sort" => \$opt::numeric_sort,
    "N|numascii" => \$opt::numascii,
    "r|reverse" => \$opt::reverse,
    "R|random-sort" => \$opt::random_sort,
    "sort=s" => \$opt::sort,
    "V|version-sort" => \$opt::version_sort,
    "k|key=s" => \@opt::key,
    "t|field-separator=s" => \$opt::field_separator,
    "z|zero-terminated" => \$opt::zero_terminated,
    "files0-from=s" => \$opt::files0_from,
    # The options below are only accepted so they can be passed on to
    # sort: their values all end up in the same throw-away variable.
    "random-source=s" => \$opt::dummy,
    "batch-size=s" => \$opt::dummy,
    "check=s" => \$opt::dummy,
    "c" => \$opt::dummy,
    "C" => \$opt::dummy,
    "compress-program=s" => \$opt::dummy,
    "T|temporary-directory=s" => \$opt::dummy,
    "parallel=s" => \$opt::dummy,
    "u|unique" => \$opt::dummy,
    "S|buffer-size=s" => \$opt::dummy,
    "s|stable" => \$opt::dummy,
    "help" => \$opt::dummy,
    ) || exit(255);
# Basename of the running script
$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
$Global::version = 20200412;
if($opt::version) {
    # Print the version directly: this script defines no version() sub,
    # so calling one would die with 'Undefined subroutine'.
    print "GNU $Global::progname $Global::version\n";
    exit 0;
}
# The leading part of the command line that GetOptions consumed
# = the options that will be given to every sort
@Global::sortoptions = @ARGV_before[0..($#ARGV_before-$#ARGV-1)];
#if($opt::zero_terminated) { $/ = "\0"; }

$ENV{'TMPDIR'} ||= "/tmp";
sub merge {
    # Combine commands that each output sorted data into a single
    # command that outputs all the data sorted.
    # The commands are merged two and two with 'sort -m' reading from
    # <(...), level by level, until only one command is left.
    # Input:
    #   @cmd = commands to 'cat' (part of) a file
    # Returns:
    #   the combined command (or the input unchanged if there are
    #   fewer than 2 commands)
    my @pending = @_;
    chomp(@pending);
    until(@pending <= 1) {
        my @next_level;
        while(@pending) {
            # Take the next pair. $right is undef if the count is odd
            my ($left, $right) = splice(@pending, 0, 2);
            $left = "<($left)" if $left;
            $right = "<($right)" if $right;
            # Ignore errors from mbuffer - it gives errors when a pipe is closed
            push @next_level,
                "sort -m @Global::sortoptions $left $right | ".buffer();
        }
        @pending = @next_level;
    }
    return @pending;
}
sub sort_files {
    # Sort the content of files and print the result on stdout
    # Input:
    #   @files = names of the files to sort
    # Returns: N/A
    # Dies with an explanation if parallel or bash cannot be started.
    my @files = @_;
    # Let GNU Parallel generate the commands to read parts of files
    # The commands split at \n and there will be at least one for each CPU thread
    open(my $par,"-|",qw(parallel --pipepart --block -1 --dryrun -vv sort),
         @Global::sortoptions, '::::', @files)
        || die "$Global::progname: Cannot run parallel: $!\n";
    # Combine the per-part sort commands into a single merging command
    my @cmd = merge(<$par>);
    close $par;
    # The command uses <(...) so it is incompatible with /bin/sh
    open(my $bash,"|-","bash")
        || die "$Global::progname: Cannot run bash: $!\n";
    print $bash @cmd;
    # close waits for bash (and thereby the sort) to finish
    close $bash;
}
sub sort_stdin {
    # Sort stdin and print the result on stdout
    # Input is stdin
    # Spread the input between n processes that each sort
    # n = number of CPU threads
    # Returns: N/A
    # Dies if the number of threads cannot be determined or if fork fails.
    my $numthreads = `parallel --number-of-threads`;
    $numthreads = "" if not defined $numthreads;
    # Remove the trailing newline: the value is also used as an argument
    chomp($numthreads);
    if($numthreads !~ /^\d+$/ or $numthreads < 1) {
        die "$Global::progname: Cannot get the number of CPU threads ".
            "from 'parallel --number-of-threads'\n";
    }
    # tmpfifo() both picks the name and creates the fifo
    my @fifos = map { tmpfifo() } 1..$numthreads;
    # This trick removes the fifo as soon as it is connected in the other end
    # (rm fifo; ...) < fifo
    # NOTE(review): the sort options and fifo names are put unquoted
    # into a shell command - options containing spaces will break
    my @cmd = map { "(rm $_; sort @Global::sortoptions) < $_" } @fifos;
    @cmd = merge(@cmd);
    my $pid = fork();
    if(not defined $pid) {
        # Without this check a failed fork would exec parallel in this
        # process, and nothing would ever read the fifos
        my $err = $!;
        unlink @fifos;
        die "$Global::progname: Cannot fork: $err\n";
    }
    if($pid == 0) {
        # Child: distribute stdin round robin into the fifos
        exec(qw(parallel -j),$numthreads,
             # 1M 30M = 43s
             # 3M 30M = 59s
             # 300k 30M = 40-45s
             # 100k 30M = 47s
             # 500k 30M = 44s
             # 300k 10M = 41-45s
             # 256k 10M = 42-44s
             # 300k 3M = 42-45s
             # 300k - = 47s
             # 286k is the best mean value after testing 250..350
             qw(--block 286k --pipe --roundrobin ),buffer(),qw(> {} :::),@fifos);
        # Only reached if exec failed: the child must not continue
        # into the code of the parent
        print STDERR "$Global::progname: Cannot run parallel: $!\n";
        exit(255);
    }
    # Parent: sort each fifo and merge the results
    # The command uses <(...) so it is incompatible with /bin/sh
    open(my $bash,"|-","bash")
        || die "$Global::progname: Cannot run bash: $!\n";
    print $bash @cmd;
    close $bash;
}
sub tmpname {
    # Select a name in $TMPDIR that does not exist
    # Do not create the file: the caller decides what to create (e.g. a fifo)
    # Input:
    #   $name = prefix of the name
    # Returns:
    #   $tmpname = $TMPDIR/$name followed by 5 random characters [0-9a-zA-Z]
    # Exits with 255 if $TMPDIR does not exist or is not writable.
    my $name = shift;
    my($tmpname);
    if(not -w $ENV{'TMPDIR'}) {
        my @msg;
        if(not -e $ENV{'TMPDIR'}) {
            @msg = ("Tmpdir '$ENV{'TMPDIR'}' does not exist.",
                    "Try 'mkdir $ENV{'TMPDIR'}'");
        } else {
            @msg = ("Tmpdir '$ENV{'TMPDIR'}' is not writable.",
                    "Try 'chmod +w $ENV{'TMPDIR'}'");
        }
        # This script has no error()/wait_and_exit() helpers, so
        # report on stderr and exit here
        print STDERR map { "$Global::progname: Error: $_\n" } @msg;
        exit(255);
    }
    do {
        $tmpname = $ENV{'TMPDIR'}."/".$name.
            join"", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5);
    } while(-e $tmpname);
    return $tmpname;
}
sub tmpfifo {
    # Find an unused name and mkfifo on it
    # Returns:
    #   $tmpfifo = name of the created fifo (mode 0600)
    # Dies if the fifo cannot be created.
    my $tmpfifo = tmpname("psort");
    mkfifo($tmpfifo,0600)
        || die "$Global::progname: Cannot create fifo '$tmpfifo': $!\n";
    return $tmpfifo;
}
{
    # Cache for buffer(): the command is only looked up once
    my $buffer;

    sub buffer {
        # Returns:
        #   $buffer = command that copies stdin to stdout through a
        #     buffer: mbuffer if it is found in $PATH, otherwise cat
        if(not defined $buffer) {
            if(which("mbuffer")) {
                # Use mbuffer if installed
                # 30M = 43s
                # 10M = 41-45s
                # 3M = 42-45s
                # Ignore errors from mbuffer - it gives errors when a pipe is closed
                $buffer = "mbuffer -v0 -q -m 30M";
            } else {
                $buffer = "cat";
            }
        }
        return $buffer;
    }
}
sub which {
    # Search $PATH for programs
    # Input:
    #   @programs = programs to find the path to
    # Returns:
    #   list context: full paths to @programs. Nothing if not found
    #   scalar context: the first path found (undef if none)
    my @programs = @_;
    my @found;
    for my $prg (@programs) {
        # Every executable non-directory called $prg in a dir in $PATH
        for my $dir (split(":",$ENV{'PATH'})) {
            my $candidate = $dir."/".$prg;
            next if -d $candidate;
            push(@found, $candidate) if -x $candidate;
        }
        if($prg =~ m:/:) {
            # Including path: also test the name as given
            push(@found, $prg) if not -d $prg and -x $prg;
        }
    }
    return wantarray ? @found : $found[0];
}
# Select the input: files on the command line, files listed in
# --files0-from, or stdin
if(@ARGV) {
    sort_files(@ARGV);
} elsif(length $opt::files0_from) {
    my @files;
    {
        # The file names are separated by NUL.
        # $/ must only be changed while reading the list: sort_files()
        # later reads newline separated commands, and with $/ still
        # set to NUL they would be read as a single record and never
        # be merged.
        local $/ = "\0";
        open(my $fh,"<",$opt::files0_from)
            || die "$Global::progname: Cannot open '$opt::files0_from': $!\n";
        @files = <$fh>;
        chomp(@files);
        close $fh;
    }
    sort_files(@files);
} else {
    sort_stdin();
}
# Test
# -z
# OK: cat bigfile | parsort
# OK: parsort -k4n files*.txt
# OK: parsort files*.txt
# OK: parsort "file with space"

View file

@ -168,6 +168,12 @@ par_onall_transfer() {
echo Cleanup failed
}
# Check that the --plus replacement string {host} is expanded when
# combined with --onall (with an argument) and --nonall (no arguments).
# Both commands run 'echo {host}' on the sshlogin bash@lo.
par_--onall_--plus() {
echo '### Test --plus is respected with --onall/--nonall'
parallel -S bash@lo --onall --plus echo {host} ::: OK
parallel -S bash@lo --nonall --plus echo {host}
}
par_remote_load() {
echo '### Test --load remote'
ssh parallel@lo 'seq 10 | parallel --nice 19 --timeout 15 -j0 -qN0 perl -e while\(1\)\{\ \}' &

View file

@ -1,6 +1,9 @@
echo TODO
TODO
## echo '### Test --trc --basefile --/./--foo7 :/./:foo8 " "/./" "foo9 ./foo11/./foo11'
par_--onall_--plus ### Test --plus is respected with --onall/--nonall
par_--onall_--plus lo
par_--onall_--plus lo
par_PARALLEL_SSH_function ### use function as $PARALLEL_SSH
par_PARALLEL_SSH_function Run through FOOSSH?
par_PARALLEL_SSH_function FOOSSH