Mirror of https://git.savannah.gnu.org/git/parallel.git
(synced 2024-11-21 21:47:54 +00:00)

parallel: With many --sqlworkers minimize the number of duplicates.

commit 3cd3f75200, parent b5b3d5dc3e
@@ -188,7 +188,7 @@ from:tange@gnu.org
 to:parallel@gnu.org, bug-parallel@gnu.org
 stable-bcc: Jesse Alama <jessealama@fastmail.fm>
 
-Subject: GNU Parallel 20200622 ('SpaceX') released <<[stable]>>
+Subject: GNU Parallel 20200622 ('SpaceX/Floyd') released <<[stable]>>
 
 GNU Parallel 20200622 ('') <<[stable]>> has been released. It is available for download at: http://ftpmirror.gnu.org/parallel/
 
@@ -201,12 +201,18 @@ Quote of the month:
 New in this release:
 
 *
+  https://www.slideshare.net/hoffmanlab/gnu-parallel-194030490
+  https://www.openskysoftware.com/site-credits.htm
 
 
 * Bug fixes and man page updates.
 
 News about GNU Parallel:
 
+  https://bioinformaticsworkbook.org/Appendix/GNUparallel/GNU_parallel_examples.html#gsc.tab=0
+
+  http://pdebuyl.be/blog/2020/gnu-parallel-for-simulations.html
+
   https://negfeedback.blogspot.com/2020/05/indispensable-command-line-tools.html
 
 Get the book: GNU Parallel 2018 http://www.lulu.com/shop/ole-tange/gnu-parallel-2018/paperback/product-23558902.html
 
src/parallel (70 lines changed)
@@ -12299,8 +12299,13 @@ sub new($) {
     my $dbh;
     if($driver eq "CSV") {
         # CSV does not use normal dsn
-        $dbh = DBI->connect("dbi:CSV:", "", "", { f_dir => "$database", })
-            or die $DBI::errstr;
+        if(-d $database) {
+            $dbh = DBI->connect("dbi:CSV:", "", "", { f_dir => "$database", })
+                or die $DBI::errstr;
+        } else {
+            ::error("$database is not a directory.");
+            ::wait_and_exit(255);
+        }
     } else {
         $dbh = DBI->connect($dsn, $userid, $password,
                             { RaiseError => 1, AutoInactiveDestroy => 1 })
@@ -12311,7 +12316,6 @@ sub new($) {
     $dbh->{'RaiseError'} = 1;
     $dbh->{'ShowErrorStatement'} = 1;
     $dbh->{'HandleError'} = sub {};
-
     if(not defined $options{'table'}) {
         ::error("The DBURL ($dburl) must contain a table.");
         ::wait_and_exit(255);
@@ -12650,26 +12654,58 @@ sub insert_records($) {
              0, @$record_ref[1..$#$record_ref]);
 }
 
 
 sub get_record($) {
     my $self = shift;
     my @retval;
     my $table = $self->table();
     my @v_cols = map { ", V$_" } (1..$self->max_number_of_args());
-    my $v = $self->get("SELECT Seq, Command @v_cols FROM $table ".
-                       "WHERE Exitval = -1000 ORDER BY Seq LIMIT 1;");
-    if($v->[0]) {
-        my $val_ref = $v->[0];
-        # Mark record as taken
-        my $seq = shift @$val_ref;
-        # Save the sequence number to use when running the job
-        $SQL::next_seq = $seq;
-        $self->update("SET Exitval = ? WHERE Seq = ".$seq, -1220);
-        my @command = split /\257 /, shift @$val_ref;
-        $SQL::command_ref = \@command;
-        for (@$val_ref) {
-            push @retval, Arg->new($_);
-        }
-    }
+    my $rand = "Reserved-".$$.rand();
+    my $v;
+    my $more_pending;
+
+    do {
+        if($self->{'driver'} eq "CSV") {
+            # Sub SELECT is not supported in CSV
+            # So to minimize the race condition below select a job at random
+            my $r = $self->get("SELECT Seq, Command @v_cols FROM $table ".
+                               "WHERE Exitval = -1000 LIMIT 100;");
+            $v = [ sort { rand() > 0.5 } @$r ];
+        } else {
+            # Avoid race condition where multiple workers get the same job
+            # by setting Stdout to a unique string
+            # (SELECT * FROM (...) AS dummy) is needed due to sillyness in MySQL
+            $self->update("SET Stdout = ?,Exitval = ? ".
+                          "WHERE Seq = (".
+                          " SELECT * FROM (".
+                          " SELECT min(Seq) FROM $table WHERE Exitval = -1000".
+                          " ) AS dummy".
+                          ") AND Exitval = -1000;", $rand, -1210);
+            # If a parallel worker overwrote the unique string this will get nothing
+            $v = $self->get("SELECT Seq, Command @v_cols FROM $table ".
+                            "WHERE Stdout = ?;", $rand);
+        }
+        if($v->[0]) {
+            my $val_ref = $v->[0];
+            # Mark record as taken
+            my $seq = shift @$val_ref;
+            # Save the sequence number to use when running the job
+            $SQL::next_seq = $seq;
+            $self->update("SET Exitval = ? WHERE Seq = ".$seq, -1220);
+            # Command is encoded with '\257 space' as splitting char
+            my @command = split /\257 /, shift @$val_ref;
+            $SQL::command_ref = \@command;
+            for (@$val_ref) {
+                push @retval, Arg->new($_);
+            }
+        } else {
+            # If the record was updated by another job in parallel,
+            # then we may not be done, so see if there are more jobs pending
+            $more_pending =
+                $self->get("SELECT Seq FROM $table WHERE Exitval = ?;", -1210);
+        }
+    } while (not $v->[0] and $more_pending->[0]);
 
     if(@retval) {
         return \@retval;
     } else {
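The core of the change is how get_record() now claims a job. Instead of simply taking the lowest unfinished Seq (which let several --sqlworkers grab the same row), a worker writes a worker-unique token ("Reserved-" . pid . random) into Stdout of the oldest row still marked -1000, then reads back only rows carrying its own token, and retries while other rows remain reserved (-1210). CSV gets a weaker fallback because DBD::CSV cannot run the sub-SELECT: pick among up to 100 pending rows in random order so that collisions become unlikely. Below is a minimal, standalone Perl/DBI sketch of the claim-by-token idea only; the "jobs" table, its columns, and the SQLite DSN are illustrative assumptions for the sketch, not code taken from src/parallel.

#!/usr/bin/perl
# Sketch of the claim-by-token idea used by get_record() above.
# Assumptions (not from src/parallel): a table "jobs" with Seq, Command,
# Stdout and Exitval columns, where -1000 marks an unstarted job and
# -1210 marks a job reserved by some worker.
use strict;
use warnings;
use DBI;

my $dbh = DBI->connect("dbi:SQLite:dbname=/tmp/pardb.sqlite", "", "",
                       { RaiseError => 1, AutoInactiveDestroy => 1 });

sub claim_one_job {
    # A token only this process knows, like "Reserved-".$$.rand() in the patch.
    my $token = "Reserved-" . $$ . rand();
    # Write the token into the oldest unstarted row.  The
    # (SELECT * FROM (...) AS dummy) wrapper is the MySQL-friendly form the
    # patch uses; SQLite and PostgreSQL accept it as well.
    $dbh->do("UPDATE jobs SET Stdout = ?, Exitval = -1210 ".
             "WHERE Seq = (SELECT * FROM ".
             " (SELECT min(Seq) FROM jobs WHERE Exitval = -1000) AS dummy) ".
             "AND Exitval = -1000", undef, $token);
    # If another worker overwrote the token first, this finds nothing and
    # the caller should retry while reserved (-1210) rows remain.
    return $dbh->selectrow_arrayref(
        "SELECT Seq, Command FROM jobs WHERE Stdout = ?", undef, $token);
}

if(my $job = claim_one_job()) {
    print "claimed Seq=$job->[0]: $job->[1]\n";
}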
@@ -2380,8 +2380,11 @@ E.g.
   sql:oracle://scott:tiger@ora.example.com/xe/parjob
   postgresql://scott:tiger@pg.example.com/pgdb/parjob
   pg:///parjob
-  sqlite3:///pardb/parjob.sqlite
-  csv:///%2Ftmp%2Fmydir/jobtable
+  sqlite3:///%2Ftmp%2Fpardb.sqlite/parjob
+  csv:///%2Ftmp%2Fpardb/parjob
 
+Notice how / in the path of sqlite and CSV must be encoded as
+%2F. Except the last / in CSV, which must be a /.
+
 It can also be an alias from ~/.sql/aliases:
 
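As a side note (not part of the patch): the %2F rule can be scripted. The sketch below builds the csv: DBURL from the man page's own example, the directory /tmp/pardb and the table parjob; the variable names are purely illustrative.

# Sketch only: build a csv: DBURL as described above.  Every / inside the
# directory path is escaped to %2F; the final /, which separates the
# directory from the table name, stays a literal /.
use URI::Escape;                       # provides uri_escape()

my $dir   = "/tmp/pardb";              # directory holding the CSV table
my $table = "parjob";                  # table (file) name inside that dir
my $dburl = "csv:///" . uri_escape($dir, "/") . "/" . $table;
print "$dburl\n";                      # prints: csv:///%2Ftmp%2Fpardb/parjob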
@@ -6,9 +6,11 @@
 export SQLITE=sqlite3:///%2Frun%2Fshm%2Fparallel.db
 export PG=pg://`whoami`:`whoami`@lo/`whoami`
 export MYSQL=mysql://`whoami`:`whoami`@lo/`whoami`
+export CSV=csv:///%2Frun%2Fshm%2Fcsv
 
 export DEBUG=false
-rm /run/shm/parallel.db
+rm -f /run/shm/parallel.db
+mkdir -p /run/shm/csv
 
 p_showsqlresult() {
     SERVERURL=$1
@@ -125,7 +127,8 @@ export -f $(compgen -A function | egrep 'p_|par_')
 # Tested that -j0 in parallel is fastest (up to 15 jobs)
 compgen -A function | grep par_ | sort |
     stdout parallel -vj5 -k --tag --joblog /tmp/jl-`basename $0` p_wrapper \
-    :::: - ::: \$MYSQL \$PG \$SQLITE | perl -pe 's/tbl\d+/TBL99999/gi;' |
+    :::: - ::: \$MYSQL \$PG \$SQLITE \$CSV |
+    perl -pe 's/tbl\d+/TBL99999/gi;' |
     perl -pe 's/(from TBL99999 order) .*/$1/g' |
     perl -pe 's/ *\b'"$hostname"'\b */hostname/g' |
     grep -v -- --------------- |
testsuite/tests-to-run/parallel-local-sql02.sh (new file, 45 lines)
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# GNU Parallel SQL tests
+# The tests must be able to run in parallel
+
+export SQLITE=sqlite3:///%2Frun%2Fshm%2Fparallel.db
+export PG=pg://`whoami`:`whoami`@lo/`whoami`
+export MYSQL=mysql://`whoami`:`whoami`@lo/`whoami`
+export CSV=csv:///%2Frun%2Fshm%2Fcsv
+
+rm -f /run/shm/parallel.db
+mkdir -p /run/shm/csv
+
+par_few_duplicate_run() {
+    echo '### With many workers there will be some duplicates'
+    TABLE=TBL$RANDOM
+    DBURL="$1"/$TABLE
+    parallel --sqlmaster $DBURL echo ::: {1..100}
+    lines=$( (
+        parallel --sqlworker $DBURL &
+        parallel --sqlworker $DBURL &
+        parallel --sqlworker $DBURL &
+        parallel --sqlworker $DBURL &
+        wait
+    ) | wc -l)
+    if [ $lines -gt 105 ] ; then
+        echo Error: $lines are more than 5% duplicates
+    else
+        echo OK
+    fi
+}
+
+hostname=`hostname`
+export -f $(compgen -A function | egrep 'p_|par_')
+# Tested that -j0 in parallel is fastest (up to 15 jobs)
+compgen -A function | grep par_ | sort |
+    stdout parallel -vj5 -k --tag --joblog /tmp/jl-`basename $0` {1} {2} \
+    :::: - ::: \$CSV \$MYSQL \$PG \$SQLITE |
+    perl -pe 's/tbl\d+/TBL99999/gi;' |
+    perl -pe 's/(from TBL99999 order) .*/$1/g' |
+    perl -pe 's/ *\b'"$hostname"'\b */hostname/g' |
+    grep -v -- --------------- |
+    perl -pe 's/ *\bhost\b */host/g' |
+    perl -pe 's/ +/ /g'
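Note on the 105-line threshold above: the master inserts 100 jobs (echo ::: {1..100}), so a duplicate-free run prints exactly 100 lines; allowing up to 105 lines means at most 5 jobs (5%) may be run twice across the four competing --sqlworkers.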
testsuite/wanted-results/parallel-local-sql02 (new file, 12 lines)
@@ -0,0 +1,12 @@
+par_few_duplicate_run $CSV par_few_duplicate_run $CSV
+par_few_duplicate_run $CSV ### With many workers there will be some duplicates
+par_few_duplicate_run $CSV OK
+par_few_duplicate_run $MYSQL par_few_duplicate_run $MYSQL
+par_few_duplicate_run $MYSQL ### With many workers there will be some duplicates
+par_few_duplicate_run $MYSQL OK
+par_few_duplicate_run $PG par_few_duplicate_run $PG
+par_few_duplicate_run $PG ### With many workers there will be some duplicates
+par_few_duplicate_run $PG OK
+par_few_duplicate_run $SQLITE par_few_duplicate_run $SQLITE
+par_few_duplicate_run $SQLITE ### With many workers there will be some duplicates
+par_few_duplicate_run $SQLITE OK