parallel/src/parsort

#!/usr/bin/perl

# SPDX-FileCopyrightText: 2021-2023 Ole Tange, http://ole.tange.dk and Free Software Foundation, Inc.
# SPDX-License-Identifier: GPL-3.0-or-later

=pod

=head1 NAME

parsort - Sort (big files) in parallel


=head1 SYNOPSIS

B<parsort> I<options for sort>


=head1 DESCRIPTION

B<parsort> uses GNU B<sort> to sort in parallel. It works just like
B<sort> but faster on inputs with more than 1 M lines, if you have a
multicore machine.

Hopefully these ideas will make it into GNU B<sort> in the future.


=head1 OPTIONS

Same as B<sort>. Except:

=over 4

=item B<--parallel=>I<N> (beta testing)

Change the number of sorts run concurrently to I<N>. I<N> will be
increased to the number of files if B<parsort> is given more than
I<N> files.

=back


=head1 EXAMPLE

Sort files:

  parsort *.txt > sorted.txt

Sort stdin (standard input) numerically:

  cat numbers | parsort -n > sorted.txt


=head1 PERFORMANCE

B<parsort> is faster on files than on stdin (standard input), because
different parts of a file can be read in parallel.

On a 48 core machine you should see a speedup of 3x over B<sort>.


=head1 AUTHOR

Copyright (C) 2020-2023 Ole Tange,
http://ole.tange.dk and Free Software Foundation, Inc.


=head1 LICENSE

Copyright (C) 2012 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.


=head1 DEPENDENCIES

B<parsort> uses B<sort>, B<bash>, and B<parallel>.


=head1 SEE ALSO

B<sort>


=cut

use strict;
use Getopt::Long;
use POSIX qw(mkfifo);

# Option parsing.
# "bundling" allows single-letter options to be combined (-nr).
# "require_order" stops option parsing at the first non-option, so the
# file names are left in @ARGV.
Getopt::Long::Configure("bundling","require_order");

# GetOptions removes the options it parses from @ARGV. Keep a copy of
# the full command line so the options can be passed on to 'sort' below.
my @ARGV_before = @ARGV;

# Options stored in $opt::dummy are only listed so that GetOptions
# accepts them: their values are not read in this file. They reach
# 'sort' through @Global::sortoptions.
GetOptions(
    "debug|D" => \$opt::D,
    "version" => \$opt::version,
    "verbose|v" => \$opt::verbose,
    "b|ignore-leading-blanks" => \$opt::ignore_leading_blanks,
    "d|dictionary-order" => \$opt::dictionary_order,
    "f|ignore-case" => \$opt::ignore_case,
    "g|general-numeric-sort" => \$opt::general_numeric_sort,
    "i|ignore-nonprinting" => \$opt::ignore_nonprinting,
    "M|month-sort" => \$opt::month_sort,
    "h|human-numeric-sort" => \$opt::human_numeric_sort,
    "n|numeric-sort" => \$opt::numeric_sort,
    "N|numascii" => \$opt::numascii,
    "r|reverse" => \$opt::reverse,
    "R|random-sort" => \$opt::random_sort,
    "sort=s" => \$opt::sort,
    "V|version-sort" => \$opt::version_sort,
    "k|key=s" => \@opt::key,
    "t|field-separator=s" => \$opt::field_separator,
    "z|zero-terminated" => \$opt::zero_terminated,
    "files0-from=s" => \$opt::files0_from,
    "random-source=s" => \$opt::dummy,
    "batch-size=s" => \$opt::dummy,
    "check=s" => \$opt::dummy,
    "c" => \$opt::dummy,
    "C" => \$opt::dummy,
    "compress-program=s" => \$opt::dummy,
    "T|temporary-directory=s" => \$opt::dummy,
    "parallel=s" => \$opt::parallel,
    "u|unique" => \$opt::dummy,
    "S|buffer-size=s" => \$opt::dummy,
    "s|stable" => \$opt::dummy,
    "help" => \$opt::dummy,
    ) || exit(255);
# The last path component of $0 (the name this program was called as)
$Global::progname = ($0 =~ m:(^|/)([^/]+)$:)[1];
$Global::version = 20230423;
if($opt::version) { version(); exit 0; }
# Remove -D and --parallel=N
# The slice is the leading part of the original command line that
# GetOptions consumed, i.e. everything that is no longer in @ARGV.
my @s = (grep { ! /^-D$|^--parallel=\S+$/ }
		   @ARGV_before[0..($#ARGV_before-$#ARGV-1)]);
my @sortoptions;
while(@s) {
    my $o = shift @s;
    # Remove '--parallel N'
    if($o eq "--parallel") {
	# Drop the value N that follows '--parallel', too
	$o = shift @s;
    } else {
	push @sortoptions, $o;
    }
}
# The options are interpolated into shell commands later on, so they
# are stored shell quoted.
@Global::sortoptions = shell_quote(@sortoptions);
# Default dir for the fifos created by tmpname()/tmpfifo()
$ENV{'TMPDIR'} ||= "/tmp";

sub merge {
    # Combine commands pairwise into a tree of 'sort -m' commands that
    # read each other through bash process substitution.
    # Input:
    #   @cmd = shell commands that each output (part of) a file
    # Uses:
    #   @Global::sortoptions = shell quoted options given to every sort
    # Returns:
    #   @cmd = the combined command (with fewer than 2 commands the
    #     input is returned with the line ending chomped)
    # Example:
    #   'cat a' 'cat b' 'cat c' =>
    #   sort -m <(sort -m <(cat a) <(cat b) | cat) <(sort -m <(cat c)  | cat) | cat
    my @level = @_;
    chomp(@level);
    until(@level <= 1) {
	my @next_level;
	while(@level) {
	    # $right is undef if the number of commands is odd
	    my ($left, $right) = splice(@level, 0, 2);
	    # Only wrap non-empty commands in <(...)
	    $left = "<($left)" if $left;
	    $right = "<($right)" if $right;
	    # The trailing '| cat' looks like useless use of 'cat',
	    # but contrary to naive belief it increases performance
	    # dramatically.
	    push @next_level,
		"sort -m @Global::sortoptions $left $right | cat";
	}
	@level = @next_level;
    }
    return @level;
}

sub sort_files {
    # Sort the given files in parallel and print the result on stdout
    # Input:
    #   @files = names of the files to sort
    # Uses:
    #   $opt::zero_terminated
    #   $opt::parallel
    #   @Global::sortoptions
    # Returns: N/A
    my @files = @_;
    # Let GNU Parallel generate the commands to read parts of files
    # The commands split at \n (or \0)
    # and there will be at least one for each CPU thread
    my @subopt;
    # qw() does not treat " as a quote char: qw(--recend "\0") passes
    # the 4 chars "\0" including the quotes. No shell is involved
    # (list form of open), so pass the 2 chars \0 that a shell would
    # pass for: --recend "\0"
    # NOTE(review): assumes parallel expands \0 to NUL - confirm
    if($opt::zero_terminated) { push @subopt, '--recend', '\0'; }
    if($opt::parallel) { push @subopt, qw(--jobs), $opt::parallel; }
    # $uniq is needed because @files could contain \n
    my $uniq = join "", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..20);
    open(my $par,"-|",qw(parallel), @subopt,
	 qw(--pipepart --block -1 --dryrun -vv sort),
	 @Global::sortoptions, $uniq, '::::', @files)
	|| die("Cannot run parallel: $!\n");
    # Generated commands:
    # <file perl-catter | (sort ... $uniq )
    # Use $uniq to split into commands
    # (We cannot use \n because 'file' may contain newline)
    my @cmd = map { "$_)\n" } split(/$uniq[)]\n/, join("",<$par>));
    debug(1,@cmd);
    close $par;
    @cmd = merge(@cmd);
    # The command uses <(...) so it is incompatible with /bin/sh
    open(my $bash,"|-","bash") || die("Cannot run bash: $!\n");
    print $bash @cmd;
    close $bash;
}

sub sort_stdin {
    # Sort stdin in parallel and print the result on stdout
    # Spread the input between n processes that each sort
    # n = --parallel or the number of CPU threads
    # Uses:
    #   $opt::parallel
    #   $opt::zero_terminated
    #   @Global::sortoptions
    # Returns: N/A
    my $numthreads;
    chomp($numthreads = $opt::parallel || `parallel --number-of-threads`);
    if($numthreads < 1) {
	# E.g. 'parallel --number-of-threads' failed and gave no output
	::error("Cannot determine the number of sorts to run in parallel.");
	exit(255);
    }
    # tmpfifo() creates the fifos: no need to mkfifo them again
    my @fifos = map { tmpfifo() } 1..$numthreads;
    # This trick removes the fifo as soon as it is connected in the other end
    # (rm fifo; ...) < fifo
    my @cmd = (map { "(rm $_; sort @Global::sortoptions) < $_" }
	       map { Q($_) } @fifos);
    @cmd = merge(@cmd);
    my $pid = fork();
    if(not defined $pid) {
	# Without this test a failed fork would run the child branch
	# in the parent and replace this process with parallel
	my $err = $!;
	unlink @fifos;
	::error("Cannot fork: $err");
	exit(255);
    }
    if(not $pid) {
	# Child: distribute stdin to the fifos
	# qw() does not treat " as a quote char: pass the 2 chars \0
	# that a shell would pass for: --recend "\0"
	# NOTE(review): assumes parallel expands \0 to NUL - confirm
	my @subopt = $opt::zero_terminated ? ('--recend', '\0') : ();
	exec(qw(parallel -0 -j), $numthreads, @subopt,
	     # 286k is the best mean value after testing 250..350
	     qw(--block 286k --pipe --roundrobin cat > {} :::),@fifos);
	# Only reached if exec failed: the child must not continue
	# into the code of the parent
	::error("Cannot run parallel: $!");
	exit(255);
    }
    # Parent
    # The command uses <(...) so it is incompatible with /bin/sh
    open(my $bash,"|-","bash") || die("Cannot run bash: $!\n");
    print $bash @cmd;
    close $bash;
}

sub tmpname {
    # Pick a file name in $TMPDIR that is not in use
    # The file is not created (the original comment gives the reason:
    # the name may be used for creating a socket by tmux)
    # Input:
    #   $name = prefix of the basename
    # Uses:
    #   $ENV{'TMPDIR'}
    # Returns:
    #   $tmpname = $TMPDIR/$name followed by 5 random chars from [0-9a-zA-Z]
    # Exits with 255 if $TMPDIR does not exist or is not writable
    my ($prefix) = @_;
    if(not -w $ENV{'TMPDIR'}) {
	# Tell the user how to fix the dir
	my @msg = (-e $ENV{'TMPDIR'})
	    ? ("Tmpdir '$ENV{'TMPDIR'}' is not writable.",
	       "Try 'chmod +w ".Q($ENV{'TMPDIR'})."'")
	    : ("Tmpdir '$ENV{'TMPDIR'}' does not exist.",
	       "Try 'mkdir ".Q($ENV{'TMPDIR'})."'");
	::error(@msg);
	exit(255);
    }
    my $candidate;
    while(1) {
	my $random = join("", map { (0..9,"a".."z","A".."Z")[rand(62)] }
			  (1..5));
	$candidate = $ENV{'TMPDIR'}."/".$prefix.$random;
	# Keep trying until the name is unused
	-e $candidate or last;
    }
    return $candidate;
}

sub tmpfifo {
    # Create a fifo with an unused name in $TMPDIR
    # Returns:
    #   $tmpfifo = name of the newly created fifo
    # Exits with 255 if the fifo cannot be created
    while(1) {
	my $tmpfifo = tmpname("psort");
	# mkfifo fails if the name exists, so when it succeeds the
	# fifo was created by us
	mkfifo($tmpfifo,0600) and return $tmpfifo;
	# The name was taken after tmpname() tested it: Try a new name
	$!{EEXIST} and next;
	::error("Cannot create fifo '".$tmpfifo."': $!");
	exit(255);
    }
}

sub debug {
    # Print debug messages on STDERR if -D/--debug is given
    # Input:
    #   $level = debug level (not used: any level is printed)
    #   @messages = strings to print. undef is printed as ""
    # Uses:
    #   $opt::D
    # Returns: N/A
    $opt::D or return;
    my ($level, @messages) = @_;
    # map (not grep): grep would drop every false message such as "0"
    # instead of replacing undef with ""
    print STDERR map { defined $_ ? $_ : "" } @messages;
}

sub version() {
    # Print the version and the copyright notice on stdout
    # Uses:
    #   $Global::progname
    #   $Global::version
    # Returns: N/A
    my @notice =
	("GNU $Global::progname $Global::version",
	 "Copyright (C) 2020-2023 Ole Tange, http://ole.tange.dk and Free Software",
	 "Foundation, Inc.",
	 "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>",
	 "This is free software: you are free to change and redistribute it.",
	 "GNU $Global::progname comes with no warranty.",
	 "",
	 "Web site: https://www.gnu.org/software/parallel\n",
	);
    print join("\n", @notice);
}

sub shell_quote(@) {
    # Shell quote each of the strings with Q()
    # Input:
    #   @strings = strings to be quoted
    # Returns:
    #   list context: @shell_quoted_strings = the quoted strings
    #   otherwise: the quoted strings joined with a space
    my @quoted = map { Q($_) } @_;
    if(wantarray) {
	return @quoted;
    }
    return join(" ", @quoted);
}

sub shell_quote_scalar_rc($) {
    # Quote a string for the rc-shell
    # Input:
    #   $string = string to be quoted (undef is returned as undef)
    # Returns:
    #   $quoted = every ' doubled and every run of special chars wrapped
    #     in '...'. "" becomes '' and a lone NUL becomes ""
    my $str = $_[0];
    defined $str or return $str;
    # Both substitutions must always run: count them separately
    my $doubled = ($str =~ s/'/''/g);
    my $wrapped = ($str =~ s/[\n\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]+/'$&'/go);
    if($doubled + $wrapped) {
	# Something was replaced, so the string is neither "" nor \0
	return $str;
    }
    if($str eq "") {
	return "''";
    }
    if($str eq "\0") {
	return "";
    }
    return $str;
}

sub shell_quote_scalar_csh($) {
    # Quote a string for (t)csh
    # Input:
    #   $string = string to be quoted (undef is returned as undef)
    # Returns:
    #   $quoted = special chars prefixed with \ and every newline
    #     replaced with "\<newline>". "" becomes '' and a lone NUL
    #     becomes ""
    my $str = $_[0];
    defined $str or return $str;
    # Matching the char itself and using $& was measured by the original
    # author to be 1% faster than capturing with (...) and using $1
    my $escaped = ($str =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\|\; \"\!\$\&\'\202-\377]/\\$&/go);
    # quote newline in csh as \\\n
    my $newlines = ($str =~ s/[\n]/"\\\n"/go);
    if($escaped + $newlines) {
	# Something was replaced, so the string is neither "" nor \0
	return $str;
    }
    if($str eq "") {
	return "''";
    }
    if($str eq "\0") {
	return "";
    }
    return $str;
}

sub shell_quote_scalar_default($) {
    # Quote a string for Bourne compatible shells
    # Inputs:
    #   $string = string to be quoted
    # Returns:
    #   $shell_quoted = the string wrapped in '...' if it contains
    #     chars other than - _ . + / a-z 0-9. "" becomes ''
    my $str = $_[0];
    # The empty string can never match the test below
    return "''" if $str eq "";
    # No quoting needed
    return $str unless $str =~ /[^-_.+a-z0-9\/]/i;
    # ' cannot be inside '...': Leave the quotes, "-quote the ' and
    # enter the quotes again
    $str =~ s/'/'"'"'/g;
    $str = "'" . $str . "'";
    # A ' at either end of the input leaves an unneeded '' there
    # (two substitutions are faster than s/^''|''$//g)
    $str =~ s/^''//;
    $str =~ s/''$//;
    return $str;
}

sub shell_quote_scalar($) {
    # Quote the string so the shell will not expand any special chars
    # Inputs:
    #   $string = string to be quoted
    # Uses:
    #   $Global::cshell
    #   $Global::shell
    # Returns:
    #   $shell_quoted = string quoted as needed by the shell

    # Speed optimization: The first call replaces this sub with the
    # shell_quote_scalar_* matching the shell, so later calls go
    # directly to that sub
    no warnings 'redefine';
    my $quoter =
	$Global::cshell                 ? \&shell_quote_scalar_csh :     # (t)csh
	$Global::shell =~ m:(^|/)rc$:   ? \&shell_quote_scalar_rc :      # rc-shell
	                                  \&shell_quote_scalar_default;  # other shells
    *shell_quote_scalar = $quoter;
    # The sub is now redefined. Call it
    return shell_quote_scalar($_[0]);
}

sub Q($) {
    # Q alias for ::shell_quote_scalar
    # Input:
    #   $string = string to be quoted
    # Returns:
    #   $shell_quoted = result of shell_quote_scalar($string)
    # The call must come before the aliasing: the first call of
    # shell_quote_scalar replaces that sub with the shell specific
    # implementation, so Q is aliased to the replaced sub.
    my $ret = shell_quote_scalar($_[0]);
    no warnings 'redefine';
    # From now on Q is the same sub as ::shell_quote_scalar
    *Q = \&::shell_quote_scalar;
    return $ret;
}


sub status(@) {
    # Print each message followed by a newline and flush
    # Input:
    #   @w = messages to print
    # Uses:
    #   $Global::status_fd = file handle to print to (default: STDERR)
    # Returns: N/A
    my $out = $Global::status_fd || *STDERR;
    my @pieces;
    for my $msg (@_) {
	push @pieces, $msg, "\n";
    }
    print {$out} @pieces;
    $out->flush();
}

sub status_no_nl(@) {
    # Print the messages as they are (no newline added) and flush
    # Input:
    #   @w = strings to print
    # Uses:
    #   $Global::status_fd = file handle to print to (default: STDERR)
    # Returns: N/A
    my @pieces = @_;
    my $out = $Global::status_fd || *STDERR;
    print {$out} @pieces;
    $out->flush();
}

sub warning(@) {
    # Print one 'progname: Warning: message' line per message
    # Input:
    #   @w = warning messages
    # Uses:
    #   $Global::progname (default: parsort)
    # Returns: N/A
    my $prog = $Global::progname || "parsort";
    my @pieces;
    for my $msg (@_) {
	push @pieces, $prog, ": Warning: ", $msg, "\n";
    }
    status_no_nl(@pieces);
}

{
    # The warnings that have been printed
    my %warnings;
    sub warning_once(@) {
	# Print the warning unless the same warning was printed before
	# Input:
	#   @w = warning messages (printed one per line)
	# Uses:
	#   $Global::progname (default: parsort)
	# Returns: N/A
	my @w = @_;
	my $prog = $Global::progname || "parsort";
	# The key must be the text of the messages: $warnings{@w}
	# evaluates @w in scalar context and thus uses the number of
	# messages as key, which suppresses every later warning with
	# the same number of messages
	my $key = join("\0", @w);
	$warnings{$key}++ or
	    status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);
    }
}

sub error(@) {
    # Print one 'progname: Error: message' line per message
    # Input:
    #   @w = error messages
    # Uses:
    #   $Global::progname (default: parsort)
    # Returns: N/A
    my $prog = $Global::progname || "parsort";
    my @lines;
    for my $msg (@_) {
	push @lines, $prog.": Error: ". $msg;
    }
    status(@lines);
}

sub die_bug($) {
    # Print instructions for reporting a bug on STDERR and exit
    # Input:
    #   $bugid = id of the bug to include in the report
    # Uses:
    #   $Global::progname
    #   $Global::version
    # Returns: never (exits with 255)
    my ($bugid) = @_;
    my @report =
	("$Global::progname: This should not happen. You have found a bug. ",
	 "Please follow\n",
	 "https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n",
	 "\n",
	 "Include this in the report:\n",
	 "* The version number: $Global::version\n",
	 "* The bugid: $bugid\n",
	 "* The command line being run\n",
	 "* The files being read (put the files on a webserver if they are big)\n",
	 "\n",
	 "If you get the error on smaller/fewer files, please include those instead.\n");
    print STDERR @report;
    exit(255);
}

# Main: Select the input to sort
if(@ARGV) {
    # Files given on the command line
    sort_files(@ARGV);
} elsif(length $opt::files0_from) {
    # File names are read from a file where they are separated by \0
    my @files;
    {
	# Only change the input record separator while reading the
	# names: a global $/ = "\0" would also change what chomp
	# removes in merge()
	local $/ = "\0";
	open(my $fh,"<",$opt::files0_from)
	    || die("Cannot open '".$opt::files0_from."': $!\n");
	@files = <$fh>;
	close $fh;
	chomp(@files);
    }
    sort_files(@files);
} else {
    # No files: sort stdin
    sort_stdin();
}

# Test
# -z
# OK: cat bigfile | parsort
# OK: parsort -k4n files*.txt
# OK: parsort files*.txt
# OK: parsort "file with space"
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`#!/usr/bin/perl`

parallel: Fixed bug #63586: --ll does not clear till end of line. 2022-12-20 23:57:10 +00:00			`# SPDX-FileCopyrightText: 2021-2023 Ole Tange, http://ole.tange.dk and Free Software and Foundation, Inc.`
parallel: Partial Reuse licensing support. 2021-03-22 20:16:35 +00:00			`# SPDX-License-Identifier: GPL-3.0-or-later`

release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`=pod`

			`=head1 NAME`

			`parsort - Sort (big files) in parallel`


			`=head1 SYNOPSIS`

			`B<parsort> I<options for sort>`


			`=head1 DESCRIPTION`

			`B<parsort> uses GNU B<sort> to sort in parallel. It works just like`
			`B<sort> but faster on inputs with more than 1 M lines, if you have a`
			`multicore machine.`

parallel: Fixed bug #59843: --regexp --recstart '#' fails. 2021-01-08 14:36:05 +00:00			`Hopefully these ideas will make it into GNU B<sort> in the future.`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00

parsort: --parallel=N does better what is expected. 2023-02-18 21:23:46 +00:00			`=head1 OPTIONS`

			`Same as B<sort>. Except:`

			`=over 4`

parallel: better support for wide (Asian) characters in --latest-line. 2023-03-22 19:32:12 +00:00			`=item B<--parallel=>I<N> (beta testing)`
parsort: --parallel=N does better what is expected. 2023-02-18 21:23:46 +00:00
			`Change the number of sorts run concurrently to I<N>. I<N> will be`
			`increased to number of files if B<parsort> is given more than I<N>`
			`files.`

			`=back`


release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`=head1 EXAMPLE`

			`Sort files:`

			`parsort *.txt > sorted.txt`

			`Sort stdin (standard input) numerically:`

			`cat numbers \| parsort -n > sorted.txt`


			`=head1 PERFORMANCE`

parsort: --parallel=N does better what is expected. 2023-02-18 21:23:46 +00:00			`B<parsort> is faster on files than on stdin (standard input), because`
Released as 20210722 ('Blue Unity') 2021-07-22 20:31:06 +00:00			`different parts of a file can be read in parallel.`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00
			`On a 48 core machine you should see a speedup of 3x over B<sort>.`


			`=head1 AUTHOR`

parallel: Fixed bug #63586: --ll does not clear till end of line. 2022-12-20 23:57:10 +00:00			`Copyright (C) 2020-2023 Ole Tange,`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`http://ole.tange.dk and Free Software Foundation, Inc.`


			`=head1 LICENSE`

			`Copyright (C) 2012 Free Software Foundation, Inc.`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 3 of the License, or`
			`at your option any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with this program. If not, see <http://www.gnu.org/licenses/>.`


			`=head1 DEPENDENCIES`

parsort: Performance optimized for 64 cores. 2020-10-12 23:16:03 +00:00			`B<parsort> uses B<sort>, B<bash>, and B<parallel>.`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00

			`=head1 SEE ALSO`

			`B<sort>`


			`=cut`

			`use strict;`
			`use Getopt::Long;`
			`use POSIX qw(mkfifo);`

			`Getopt::Long::Configure("bundling","require_order");`

			`my @ARGV_before = @ARGV;`
parsort: Performance optimized for 64 cores. 2020-10-12 23:16:03 +00:00
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`GetOptions(`
			`"debug\|D" => \$opt::D,`
			`"version" => \$opt::version,`
			`"verbose\|v" => \$opt::verbose,`
			`"b\|ignore-leading-blanks" => \$opt::ignore_leading_blanks,`
			`"d\|dictionary-order" => \$opt::dictionary_order,`
			`"f\|ignore-case" => \$opt::ignore_case,`
			`"g\|general-numeric-sort" => \$opt::general_numeric_sort,`
			`"i\|ignore-nonprinting" => \$opt::ignore_nonprinting,`
			`"M\|month-sort" => \$opt::month_sort,`
			`"h\|human-numeric-sort" => \$opt::human_numeric_sort,`
			`"n\|numeric-sort" => \$opt::numeric_sort,`
			`"N\|numascii" => \$opt::numascii,`
			`"r\|reverse" => \$opt::reverse,`
			`"R\|random-sort" => \$opt::random_sort,`
			`"sort=s" => \$opt::sort,`
			`"V\|version-sort" => \$opt::version_sort,`
			`"k\|key=s" => \@opt::key,`
			`"t\|field-separator=s" => \$opt::field_separator,`
			`"z\|zero-terminated" => \$opt::zero_terminated,`
			`"files0-from=s" => \$opt::files0_from,`
			`"random-source=s" => \$opt::dummy,`
			`"batch-size=s" => \$opt::dummy,`
			`"check=s" => \$opt::dummy,`
			`"c" => \$opt::dummy,`
			`"C" => \$opt::dummy,`
			`"compress-program=s" => \$opt::dummy,`
			`"T\|temporary-directory=s" => \$opt::dummy,`
parsort: --parallel=N does better what is expected. 2023-02-18 21:23:46 +00:00			`"parallel=s" => \$opt::parallel,`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`"u\|unique" => \$opt::dummy,`
			`"S\|buffer-size=s" => \$opt::dummy,`
			`"s\|stable" => \$opt::dummy,`
			`"help" => \$opt::dummy,`
			`) \|\| exit(255);`
			`$Global::progname = ($0 =~ m:(^\|/)([^/]+)$:)[1];`
parallel: --group-by --pipepart -n bug. 2023-05-06 21:42:36 +00:00			`$Global::version = 20230423;`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`if($opt::version) { version(); exit 0; }`
parsort: --parallel=N does better what is expected. 2023-02-18 21:23:46 +00:00			`# Remove -D and --parallel=N`
			`my @s = (grep { ! /^-D$\|^--parallel=\S+$/ }`
			`@ARGV_before[0..($#ARGV_before-$#ARGV-1)]);`
			`my @sortoptions;`
			`while(@s) {`
			`my $o = shift @s;`
			`# Remove '--parallel N'`
			`if($o eq "--parallel") {`
			`$o = shift @s;`
			`} else {`
			`push @sortoptions, $o;`
			`}`
			`}`
			`@Global::sortoptions = shell_quote(@sortoptions);`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`$ENV{'TMPDIR'} \|\|= "/tmp";`

			`sub merge {`
			`# Input:`
			`# @cmd = commands to 'cat' (part of) a file`
parsort: Fixed -z when sorting files. 2022-07-11 03:18:33 +00:00			`# 'cat a' 'cat b' 'cat c' =>`
			`# sort -m <(sort -m <(cat a) <(cat b)) <(sort -m <(cat c))`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`my @cmd = @_;`
			`chomp(@cmd);`
			`while($#cmd > 0) {`
			`my @tmp;`
			`while($#cmd >= 0) {`
			`my $a = shift @cmd;`
			`my $b = shift @cmd;`
			`$a &&= "<($a)";`
			`$b &&= "<($b)";`
parsort: Performance optimized for 64 cores. 2020-10-12 23:16:03 +00:00			`# This looks like useless use of 'cat', but contrary to`
			`# naive belief it increases performance dramatically.`
			`push @tmp, "sort -m @Global::sortoptions $a $b \| cat"`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`}`
			`@cmd = @tmp;`
			`}`
			`return @cmd;`
			`}`

			`sub sort_files {`
			`# Input is files`
			`my @files = @_;`
			`# Let GNU Parallel generate the commands to read parts of files`
parsort: Fixed -z when sorting files. 2022-07-11 03:18:33 +00:00			`# The commands split at \n (or \0)`
			`# and there will be at least one for each CPU thread`
parsort: --parallel=N does better what is expected. 2023-02-18 21:23:46 +00:00			`my @subopt;`
			`if($opt::zero_terminated) { push @subopt, qw(--recend "\0"); }`
			`if($opt::parallel) { push @subopt, qw(--jobs), $opt::parallel; }`
Allow $TMPDIR to contain spaces and newlines. 2023-01-30 21:51:56 +00:00			`# $uniq is needed because @files could contain \n`
			`my $uniq = join "", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..20);`
parsort: Fixed -z when sorting files. 2022-07-11 03:18:33 +00:00			`open(my $par,"-\|",qw(parallel), @subopt,`
			`qw(--pipepart --block -1 --dryrun -vv sort),`
Allow $TMPDIR to contain spaces and newlines. 2023-01-30 21:51:56 +00:00			`@Global::sortoptions, $uniq, '::::', @files) \|\| die;`
			`# Generated commands:`
			`# <file perl-catter \| (sort ... $uniq )`
			`# Use $uniq to split into commands`
			`# (We cannot use \n because 'file' may contain newline)`
			`my @cmd = map { "$_)\n" } split(/$uniq[)]\n/, join("",<$par>));`
			`debug(1,@cmd);`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`close $par;`
Allow $TMPDIR to contain spaces and newlines. 2023-01-30 21:51:56 +00:00			`@cmd = merge(@cmd);`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`# The command uses <(...) so it is incompatible with /bin/sh`
			`open(my $bash,"\|-","bash") \|\| die;`
			`print $bash @cmd;`
			`close $bash;`
			`}`

			`sub sort_stdin {`
			`# Input is stdin`
			`# Spread the input between n processes that each sort`
			`# n = number of CPU threads`
parsort: --parallel=N does better what is expected. 2023-02-18 21:23:46 +00:00			`my $numthreads;`
parallel: --group-by --pipepart -n bug. 2023-05-06 21:42:36 +00:00			chomp($numthreads = $opt::parallel \|\| `parallel --number-of-threads`);
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`my @fifos = map { tmpfifo() } 1..$numthreads;`
			`map { mkfifo($_,0600) } @fifos;`
			`# This trick removes the fifo as soon as it is connected in the other end`
			`# (rm fifo; ...) < fifo`
parsort: Fail if TMPDIR does not exist. 2021-07-09 16:39:01 +00:00			`my @cmd = (map { "(rm $_; sort @Global::sortoptions) < $_" }`
			`map { Q($_) } @fifos);`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`@cmd = merge(@cmd);`
			`if(fork) {`
			`} else {`
parsort: Fixed -z when sorting files. 2022-07-11 03:18:33 +00:00			`my @subopt = $opt::zero_terminated ? qw(--recend "\0") : ();`
Allow $TMPDIR to contain spaces and newlines. 2023-01-30 21:51:56 +00:00			`exec(qw(parallel -0 -j), $numthreads, @subopt,`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`# 286k is the best mean value after testing 250..350`
parsort: Performance optimized for 64 cores. 2020-10-12 23:16:03 +00:00			`qw(--block 286k --pipe --roundrobin cat > {} :::),@fifos);`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`}`
			`# The command uses <(...) so it is incompatible with /bin/sh`
			`open(my $bash,"\|-","bash") \|\| die;`
			`print $bash @cmd;`
			`close $bash;`
			`}`

			`sub tmpname {`
			`# Select a name that does not exist`
			`# Do not create the file as it may be used for creating a socket (by tmux)`
			`# Remember the name in $Global::unlink to avoid hitting the same name twice`
			`my $name = shift;`
			`my($tmpname);`
			`if(not -w $ENV{'TMPDIR'}) {`
			`if(not -e $ENV{'TMPDIR'}) {`
parsort: Fail if TMPDIR does not exist. 2021-07-09 16:39:01 +00:00			`::error("Tmpdir '$ENV{'TMPDIR'}' does not exist.","Try 'mkdir ".`
			`Q($ENV{'TMPDIR'})."'");`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`} else {`
parsort: Fail if TMPDIR does not exist. 2021-07-09 16:39:01 +00:00			`::error("Tmpdir '$ENV{'TMPDIR'}' is not writable.","Try 'chmod +w ".`
			`Q($ENV{'TMPDIR'})."'");`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`}`
parsort: Fail if TMPDIR does not exist. 2021-07-09 16:39:01 +00:00			`exit(255);`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`}`
			`do {`
			`$tmpname = $ENV{'TMPDIR'}."/".$name.`
			`join"", map { (0..9,"a".."z","A".."Z")[rand(62)] } (1..5);`
			`} while(-e $tmpname);`
			`return $tmpname;`
			`}`

			`sub tmpfifo {`
			`# Find an unused name and mkfifo on it`
			`my $tmpfifo = tmpname("psort");`
			`mkfifo($tmpfifo,0600);`
			`return $tmpfifo;`
			`}`

Released as 20211022 ('Sinclair') 2021-10-22 20:10:09 +00:00			`sub debug {`
			`# Returns: N/A`
			`$opt::D or return;`
			`@_ = grep { defined $_ ? $_ : "" } @_;`
			`print STDERR @_[1..$#_];`
			`}`

Fixed bug #59006: rsync version 3.2.3 is not detected correctly. 2020-08-29 20:12:46 +00:00			`sub version() {`
			`# Returns: N/A`
			`print join`
			`("\n",`
			`"GNU $Global::progname $Global::version",`
parallel: Fixed bug #63586: --ll does not clear till end of line. 2022-12-20 23:57:10 +00:00			`"Copyright (C) 2020-2023 Ole Tange, http://ole.tange.dk and Free Software",`
Fixed bug #59006: rsync version 3.2.3 is not detected correctly. 2020-08-29 20:12:46 +00:00			`"Foundation, Inc.",`
			`"License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>",`
			`"This is free software: you are free to change and redistribute it.",`
			`"GNU $Global::progname comes with no warranty.",`
			`"",`
			`"Web site: https://www.gnu.org/software/parallel\n",`
			`);`
			`}`
release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00
parsort: Fixed bug #59779: parsort does not work with space as delimiters 2021-01-02 01:01:53 +00:00			`sub shell_quote(@) {`
			`# Input:`
			`# @strings = strings to be quoted`
			`# Returns:`
			`# @shell_quoted_strings = string quoted as needed by the shell`
			`return wantarray ? (map { Q($_) } @_) : (join" ",map { Q($_) } @_);`
			`}`

			`sub shell_quote_scalar_rc($) {`
			`# Quote for the rc-shell`
			`my $a = $_[0];`
			`if(defined $a) {`
			`if(($a =~ s/'/''/g)`
			`+`
			($a =~ s/[\n\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\\|\; \"\!\$\&\'\202-\377]+/'$&'/go)) {
			`# A string was replaced`
			`# No need to test for "" or \0`
			`} elsif($a eq "") {`
			`$a = "''";`
			`} elsif($a eq "\0") {`
			`$a = "";`
			`}`
			`}`
			`return $a;`
			`}`

			`sub shell_quote_scalar_csh($) {`
			`# Quote for (t)csh`
			`my $a = $_[0];`
			`if(defined $a) {`
			# $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\>\<\~\\|\; \"\!\$\&\'\202-\377])/\\$1/g;
			`# This is 1% faster than the above`
			if(($a =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\^\*\<\=\>\~\\|\; \"\!\$\&\'\202-\377]/\\$&/go)
			`+`
			`# quote newline in csh as \\\n`
			`($a =~ s/[\n]/"\\\n"/go)) {`
			`# A string was replaced`
			`# No need to test for "" or \0`
			`} elsif($a eq "") {`
			`$a = "''";`
			`} elsif($a eq "\0") {`
			`$a = "";`
			`}`
			`}`
			`return $a;`
			`}`

			`sub shell_quote_scalar_default($) {`
			`# Quote for other shells (Bourne compatibles)`
			`# Inputs:`
			`# $string = string to be quoted`
			`# Returns:`
			`# $shell_quoted = string quoted as needed by the shell`
			`my $s = $_[0];`
			`if($s =~ /[^-_.+a-z0-9\/]/i) {`
			`$s =~ s/'/'"'"'/g; # "-quote single quotes`
			`$s = "'$s'"; # '-quote entire string`
			`$s =~ s/^''//; # Remove unneeded '' at ends`
			`$s =~ s/''$//; # (faster than s/^''\|''$//g)`
			`return $s;`
			`} elsif ($s eq "") {`
			`return "''";`
			`} else {`
			`# No quoting needed`
			`return $s;`
			`}`
			`}`

			`sub shell_quote_scalar($) {`
			`# Quote the string so the shell will not expand any special chars`
			`# Inputs:`
			`# $string = string to be quoted`
			`# Returns:`
			`# $shell_quoted = string quoted as needed by the shell`

			`# Speed optimization: Choose the correct shell_quote_scalar_*`
			`# and call that directly from now on`
			`no warnings 'redefine';`
			`if($Global::cshell) {`
			`# (t)csh`
			`*shell_quote_scalar = \&shell_quote_scalar_csh;`
			`} elsif($Global::shell =~ m:(^\|/)rc$:) {`
			`# rc-shell`
			`*shell_quote_scalar = \&shell_quote_scalar_rc;`
			`} else {`
			`# other shells`
			`*shell_quote_scalar = \&shell_quote_scalar_default;`
			`}`
			`# The sub is now redefined. Call it`
			`return shell_quote_scalar($_[0]);`
			`}`

			`sub Q($) {`
			`# Q alias for ::shell_quote_scalar`
			`my $ret = shell_quote_scalar($_[0]);`
			`no warnings 'redefine';`
			`*Q = \&::shell_quote_scalar;`
			`return $ret;`
			`}`


parsort: Fail if TMPDIR does not exist. 2021-07-09 16:39:01 +00:00			`sub status(@) {`
			`my @w = @_;`
			`my $fh = $Global::status_fd \|\| *STDERR;`
			`print $fh map { ($_, "\n") } @w;`
			`flush $fh;`
			`}`

			`sub status_no_nl(@) {`
			`my @w = @_;`
			`my $fh = $Global::status_fd \|\| *STDERR;`
			`print $fh @w;`
			`flush $fh;`
			`}`

			`sub warning(@) {`
			`my @w = @_;`
			`my $prog = $Global::progname \|\| "parsort";`
			`status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);`
			`}`

			`{`
			`my %warnings;`
			`sub warning_once(@) {`
			`my @w = @_;`
			`my $prog = $Global::progname \|\| "parsort";`
			`$warnings{@w}++ or`
			`status_no_nl(map { ($prog, ": Warning: ", $_, "\n"); } @w);`
			`}`
			`}`

			`sub error(@) {`
			`my @w = @_;`
			`my $prog = $Global::progname \|\| "parsort";`
			`status(map { ($prog.": Error: ". $_); } @w);`
			`}`

			`sub die_bug($) {`
			`my $bugid = shift;`
			`print STDERR`
			`("$Global::progname: This should not happen. You have found a bug. ",`
			`"Please follow\n",`
			`"https://www.gnu.org/software/parallel/man.html#REPORTING-BUGS\n",`
			`"\n",`
			`"Include this in the report:\n",`
			`"* The version number: $Global::version\n",`
			`"* The bugid: $bugid\n",`
			`"* The command line being run\n",`
			`"* The files being read (put the files on a webserver if they are big)\n",`
			`"\n",`
			`"If you get the error on smaller/fewer files, please include those instead.\n");`
			`exit(255);`
			`}`

release procedure updated. parallel: --plus --onall now works. parallel: --blocktimeout must be >= 1. 2020-05-31 14:42:04 +00:00			`if(@ARGV) {`
			`sort_files(@ARGV);`
			`} elsif(length $opt::files0_from) {`
			`$/="\0";`
			`open(my $fh,"<",$opt::files0_from) \|\| die;`
			`my @files = <$fh>;`
			`chomp(@files);`
			`sort_files(@files);`
			`} else {`
			`sort_stdin();`
			`}`

			`# Test`
			`# -z`
			`# OK: cat bigfile \| parsort`
			`# OK: parsort -k4n files*.txt`
			`# OK: parsort files*.txt`
			`# OK: parsort "file with space"`