diff --git a/configure b/configure index 98f4b841..13daddd0 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.67 for parallel 20110126. +# Generated by GNU Autoconf 2.67 for parallel 20110130. # # Report bugs to . # @@ -551,8 +551,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='parallel' PACKAGE_TARNAME='parallel' -PACKAGE_VERSION='20110126' -PACKAGE_STRING='parallel 20110126' +PACKAGE_VERSION='20110130' +PACKAGE_STRING='parallel 20110130' PACKAGE_BUGREPORT='bug-parallel@gnu.org' PACKAGE_URL='' @@ -1168,7 +1168,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures parallel 20110126 to adapt to many kinds of systems. +\`configure' configures parallel 20110130 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1234,7 +1234,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of parallel 20110126:";; + short | recursive ) echo "Configuration of parallel 20110130:";; esac cat <<\_ACEOF @@ -1301,7 +1301,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -parallel configure 20110126 +parallel configure 20110130 generated by GNU Autoconf 2.67 Copyright (C) 2010 Free Software Foundation, Inc. @@ -1318,7 +1318,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by parallel $as_me 20110126, which was +It was created by parallel $as_me 20110130, which was generated by GNU Autoconf 2.67. Invocation command line was $ $0 $@ @@ -2133,7 +2133,7 @@ fi # Define the identity of the package. 
PACKAGE='parallel' - VERSION='20110126' + VERSION='20110130' cat >>confdefs.h <<_ACEOF @@ -2684,7 +2684,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by parallel $as_me 20110126, which was +This file was extended by parallel $as_me 20110130, which was generated by GNU Autoconf 2.67. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -2746,7 +2746,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -parallel config.status 20110126 +parallel config.status 20110130 configured by $0, generated by GNU Autoconf 2.67, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 68e7fb87..1fe239ce 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([parallel], [20110126], [bug-parallel@gnu.org]) +AC_INIT([parallel], [20110130], [bug-parallel@gnu.org]) AM_INIT_AUTOMAKE([-Wall -Werror foreign]) AC_CONFIG_HEADERS([config.h]) AC_CONFIG_FILES([ diff --git a/doc/FUTURE_IDEAS b/doc/FUTURE_IDEAS index 3424cf36..e00e1d08 100644 --- a/doc/FUTURE_IDEAS +++ b/doc/FUTURE_IDEAS @@ -85,7 +85,7 @@ Prøv fieldsep: Find eet tegn, som optræder det samme antal gange i alle linjer Prøv klyngesep: Find den samme klynge tegn, som står samme antal gange i alle linjer (' | ' sep) Fjern whitespace før og efter colonne -hvis der er n af tegn A og 2n af tegn B, så +hvis der er n af tegn A og 2n af tegn B, så a | b | c @@ -112,6 +112,160 @@ colsep = [sepchars]{no_of_sepchars} # TODO compute how many can be transferred within max_line_length # TODO Unittest with filename that is long and requires a lot of quoting. Will there be to many +=head1 YouTube video --pipe + +cp parallel.fasta parallel.mbox lucene.tar + +# GNU Parallel 20110205 - The FOSDEM Release + +I assume you already know GNU Parallel. 
If not, watch the intro video first. + +GNU Parallel has so far worked similar to xargs. But the FOSDEM +release of GNU Parallel introduces the new --pipe option. It makes GNU +Parallel work similar to tee. + +tee pipes a copy of the output to a file and a copy to another +program. + +seq 1 5 | tee myfile | wc + +Here it pipes a copy to the file myfile and to the command word count (wc). + +cat myfile + +and we can see the content is what we expected. + +The pipe option of GNU Parallel splits data into records and pipes a +block of records into a program: + +seq 1 5 | parallel --pipe -N1 cat';' echo foo + +Here we pipe each number to the command cat and print foo after +running cat. + +GNU Parallel does this in parallel starting one process per cpu so the +order may be different because one command may finish before another. + +# RECORD SEPARATORS + +GNU Parallel splits on record separators. + +seq 1 5 | parallel --pipe --recend '\n' -N1 cat';' echo foo + +This is the example we saw before: the record separator is \n and +--recend will keep the record separator at the end of the record. + +But what if your records start with a record separator? Here is a +fast-a file: + +cat parallel.fasta + +Every record starts with a >. To keep that with the record you use +--recstart: + +cat parallel.fasta | parallel --pipe --recstart '>' -N1 cat';' echo foo + +But what if you have both? An mbox file is an example that has both an +ending and starting separator: + +cat parallel.mbox | +parallel --pipe --recend '\n\n' --recstart 'From ' -N1 cat';' echo foo | less # + +The two newlines are staying with the email before and the From_ stays with the next record. + +GNU Parallel cannot guarantee the first record will start with a record +separator and it cannot guarantee the last record will end with a record +separator. You will simply get what is first and last. + +But GNU Parallel _does_ guarantee that it will only split at record +separators. 
+ +# NUMBER OF RECORDS + +So far we have used -N1. This tells GNU Parallel to pipe one record to +the program. + +seq 1 5 | parallel --pipe -N1 cat';' echo foo + +But we can choose any amount: + +seq 1 5 | parallel --pipe -N3 cat';' echo foo + +This will pipe blocks of 3 records into cat and if there are not enough records the last block will only get two. + +# BLOCKSIZE + +However, using -N is inefficient. It is faster to pipe a full block into the program. + +cat /usr/share/dict/words | parallel --pipe --blocksize 500k wc + +Here we tell GNU Parallel to split on \n and pipe blocks of 500 KB to +wc. 1 MB is the default: + +cat /usr/share/dict/words | parallel --pipe wc + +If you just have a bunch of bytes you often do not care about the +record separator. To split input into chunks you can disable the +--recend + +ls -l lucene.tar +cat lucene.tar | parallel --pipe --recend '' -k gzip > lucene.tar.gz + +GNU Parallel will then split the input into 1 MB blocks; pipe that to +gzip and -k will make sure the order of the output is kept before +saving to the tar.gz file. + +The beauty of gzip is that if you concatenate two gzip files it is a +valid gzip file. To test this: + +tar tvzf lucene.tar.gz # + +# OUTPUT AS FILE + +Sometimes the output of GNU Parallel cannot be mixed in a single stream like this: + +seq 1 10 | shuf | parallel --pipe -N 3 sort -n + +As you can see each block of 3 is sorted but the whole output is not sorted. + +GNU Parallel can put the output in files. GNU Parallel will then list the +files created: + +seq 1 10 | shuf | parallel --pipe --files -N 3 sort -n + +Each of these files contains a sorted block: + +cat + +Sort has -m to merge sorted files into a sorted stream + +seq 1 10 | shuf | parallel --pipe --files -N 3 sort -n | parallel -mj1 sort -nm + +-m will append all the files behind the sort command and the -j1 will +make sure we only run one command. The only part missing now is +cleaning up by removing the temporary files. 
We can do that by +appending rm + +seq 1 10 | shuf | parallel --pipe --files -N 3 sort -n | +parallel -mj1 sort -nm {} ";"rm {} + + +# Thank you for watching +# +# If you like GNU Parallel: +# * Post this video on forums/blogs/Twitter/Facebook/Linkedin +# * Join the mailing list http://lists.gnu.org/mailman/listinfo/parallel +# * Request or write a review for your favourite magazine +# * Request or build a package for your favourite distribution +# * Invite me for your next conference (Contact http://ole.tange.dk) +# +# If GNU Parallel saves you money: +# * (Have your company) donate to FSF https://my.fsf.org/donate/ +# +# Find GNU Parallel at http://www.gnu.org/software/parallel/ + + + =head1 YouTube video2 Converting of WAV files to MP3 using GNU Parallel @@ -177,7 +331,7 @@ it easy to distribute jobs to these. terminal2: ssh parallel@vh2.pi.dk ssh parallel@vh2.pi.dk - and + and PS1="\[\e[7m\]GNU Parallel:\[\033[01;34m\]\w\[\033[00m\e[27m\]$ " gunzip logs/*gz @@ -362,7 +516,7 @@ find . -name '*.gz' | parallel -j+0 "zcat {} | bzip2 >{.}.bz2 && rm {}" # Create a directory for each zip-file and unzip it in that dir parallel 'mkdir {.}; cd {.}; unzip ../{}' ::: *.zip -# Convert all *.mp3 in subdirs to *.ogg running +# Convert all *.mp3 in subdirs to *.ogg running # one process per CPU core on local computer and server2 find . 
-name '*.mp3' | parallel --trc {.}.ogg -j+0 -S server2,: \ 'mpg321 -w - {} | oggenc -q0 - -o {.}.ogg' diff --git a/doc/release_new_version b/doc/release_new_version index 0f3e4947..7f742988 100644 --- a/doc/release_new_version +++ b/doc/release_new_version @@ -66,12 +66,14 @@ echo put parallel-$YYYYMMDD.tar.bz2{,.sig,*asc} | ncftp ftp://ftp-upload.gnu.org == Download and test == pushd /tmp +rm parallel-$YYYYMMDD.tar.bz2 wget http://ftp.gnu.org/gnu/parallel/parallel-$YYYYMMDD.tar.bz2 #wget http://alpha.gnu.org/gnu/parallel/parallel-$YYYYMMDD.tar.bz2 tar xjvf parallel-$YYYYMMDD.tar.bz2 cd parallel-$YYYYMMDD ./configure make -j && sudo make -j install +pushd == Update OpenSUSE build system == @@ -138,17 +140,22 @@ cc:Peter Simons , Sandro Cazzaniga , ryoichiro.suzuki@gmail.com,kerick@shiftedbit.net, Christian Faulhammer , Ryoichiro Suzuki -Subject: GNU Parallel 2011XXXX released +Subject: GNU Parallel 20110205 (FOSDEM release) released -GNU Parallel 2011XXXX has been released. It is available for -download at: http://ftp.gnu.org/gnu/parallel/ +GNU Parallel 20110205 (the FOSDEM release) has been released. It is +available for download at: http://ftp.gnu.org/gnu/parallel/ This is a major release as the --pipe option introduces a new way to -work. To learn about --pipe see the example section for uses of ---pipe. +work. GNU Parallel has so far been similar to xargs, with --pipe it +becomes somewhat similar to tee. To learn about --pipe see the example +section for uses of --pipe. But rest assured: No old functionality is changed. +If you want GNU Parallel to be part of your favourite distribution +contact the people maintaining the distribution (complaining on +Twitter is not enough). + New in this release: * --pipe splits piped data into blocks. Each block is piped to a @@ -170,12 +177,22 @@ New in this release: followed immediately by a start of a record. This is useful if either recend or recstart can occur in the middle of a record. 
+* --remove-rec-sep removes the string matched by --recstart and + --recend. + +* --regexp will make GNU Parallel treat --recstart and --recend as + regular expressions. + * --output-as-files will put the output of the programs into files and instead of giving the output GNU Parallel will output the name of these files. -* -N set the number of records to read. If used with --blocksize - the block read will at most be --blocksize. +* -N if used with --pipe sets the number of records to read. + +* GNU Parallel was presented at FOSDEM. + +* Article in USENIX Magazine ;login: (print) + http://www.usenix.org/publications/login/2011-02/ * GNU Parallel is now on ohloh.net. Thanks to Wim Muskee. https://www.ohloh.net/p/gnu-parallel diff --git a/src/niceload b/src/niceload index 97b637b7..282aa150 100755 --- a/src/niceload +++ b/src/niceload @@ -236,7 +236,7 @@ B(1), B(1) use strict; use Getopt::Long; $Global::progname="niceload"; -$Global::version = 20110126; +$Global::version = 20110130; Getopt::Long::Configure("bundling","require_order"); get_options_from_array(\@ARGV) || die_usage(); if($::opt_version) { diff --git a/src/parallel b/src/parallel index 00079818..a66f553a 100755 --- a/src/parallel +++ b/src/parallel @@ -88,17 +88,17 @@ sub spreadstdin { # If both --recstart and --recend is given then both must match $recstart = $::opt_recstart; $recend = $::opt_recend; - $recerror = "Warning: --recend and --recstart unmatched. Is --blocksize too small?"; + $recerror = "parallel: Warning: --recend and --recstart unmatched. Is --blocksize too small?"; } elsif(defined($::opt_recstart)) { # If --recstart is given it must match start of record $recstart = $::opt_recstart; $recend = ""; - $recerror = "Warning: --recstart unmatched. Is --blocksize too small?"; + $recerror = "parallel: Warning: --recstart unmatched. 
Is --blocksize too small?"; } elsif(defined($::opt_recend)) { # If --recend is given then it must match end of record $recstart = ""; $recend = $::opt_recend; - $recerror = "Warning: --recend unmatched. Is --blocksize too small?"; + $recerror = "parallel: Warning: --recend unmatched. Is --blocksize too small?"; } while(read(STDIN,substr($buf,length $buf,0),$::opt_blocksize)) { @@ -333,6 +333,7 @@ sub get_options_from_array { "pipe|spreadstdin" => \$::opt_pipe, "recstart=s" => \$::opt_recstart, "recend=s" => \$::opt_recend, + "regexp|regex" => \$::opt_regexp, "remove-rec-sep|removerecsep|rrs" => \$::opt_remove_rec_sep, "files|output-as-files|outputasfiles" => \$::opt_files, "block|block-size|blocksize=s" => \$::opt_blocksize, @@ -377,7 +378,7 @@ sub get_options_from_array { sub parse_options { # Returns: N/A # Defaults: - $Global::version = 20110126; + $Global::version = 20110130; $Global::progname = 'parallel'; $Global::infinity = 2**31; $Global::debug = 0; @@ -519,7 +520,7 @@ sub parse_options { # As we do not know the max line length on the remote machine # long commands generated by xargs may fail # If opt_N is set, it is probably safe - print STDERR ("Warning: using -X or -m with --sshlogin may fail\n"); + print STDERR ("parallel: Warning: using -X or -m with --sshlogin may fail\n"); } if(not defined $::opt_P) { @@ -1265,19 +1266,19 @@ sub parse_sshlogin { # There are no remote hosts if(defined @::opt_trc) { print $Global::original_stderr - "Warning: --trc ignored as there are no remote --sshlogin\n"; + "parallel: Warning: --trc ignored as there are no remote --sshlogin\n"; } elsif (defined $::opt_transfer) { print $Global::original_stderr - "Warning: --transfer ignored as there are no remote --sshlogin\n"; + "parallel: Warning: --transfer ignored as there are no remote --sshlogin\n"; } elsif (defined @::opt_return) { print $Global::original_stderr - "Warning: --return ignored as there are no remote --sshlogin\n"; + "parallel: Warning: --return ignored as there 
are no remote --sshlogin\n"; } elsif (defined $::opt_cleanup) { print $Global::original_stderr - "Warning: --cleanup ignored as there are no remote --sshlogin\n"; + "parallel: Warning: --cleanup ignored as there are no remote --sshlogin\n"; } elsif (defined @::opt_basefile) { print $Global::original_stderr - "Warning: --basefile ignored as there are no remote --sshlogin\n"; + "parallel: Warning: --basefile ignored as there are no remote --sshlogin\n"; } } } @@ -1898,20 +1899,20 @@ sub processes_available_by_system_limit { # Give the user a warning. He can press Ctrl-C if this # sucks. print $Global::original_stderr - ("Warning: Starting 10 extra processes takes > 2 sec.\n", + ("parallel: Warning: Starting 10 extra processes takes > 2 sec.\n", "Consider adjusting -j. Press CTRL-C to stop.\n"); $slow_spawining_warning_printed = 1; } } if($system_limit < $wanted_processes and not $more_filehandles) { print $Global::original_stderr - ("Warning: Only enough filehandles to run ", + ("parallel: Warning: Only enough filehandles to run ", $system_limit, " jobs in parallel. 
", "Raising ulimit -n may help\n"); } if($system_limit < $wanted_processes and $max_system_proc_reached) { print $Global::original_stderr - ("Warning: Only enough available processes to run ", + ("parallel: Warning: Only enough available processes to run ", $system_limit, " jobs in parallel.\n"); } # Cleanup: Close the files @@ -1948,7 +1949,7 @@ sub simultaneous_sshlogin_limit { if($ssh_limit < $wanted_processes) { my $serverlogin = $self->serverlogin(); print $Global::original_stderr - ("Warning: ssh to $serverlogin only allows ", + ("parallel: Warning: ssh to $serverlogin only allows ", "for $ssh_limit simultaneous logins.\n", "You may raise this by changing ", "/etc/ssh/sshd_config:MaxStartup on $serverlogin\n", @@ -2059,7 +2060,7 @@ sub ncpus { $self->{'ncpus'} = $ncpu; } else { print $Global::original_stderr - ("Warning: Could not figure out ", + ("parallel: Warning: Could not figure out ", "number of cpus on $serverlogin. Using 1\n"); $self->{'ncpus'} = 1; } @@ -2080,7 +2081,7 @@ sub no_of_cpus { if($no_of_cpus) { return $no_of_cpus; } else { - warn("Cannot figure out number of cpus. Using 1"); + warn("parallel: Cannot figure out number of cpus. Using 1"); return 1; } } @@ -2097,7 +2098,7 @@ sub no_of_cores { if($no_of_cores) { return $no_of_cores; } else { - warn("Cannot figure out number of CPU cores. Using 1"); + warn("parallel: Cannot figure out number of CPU cores. Using 1"); return 1; } } @@ -2699,7 +2700,7 @@ sub sshtransfer { $pre .= "$mkremote_workdir; rsync $rsync_opt ".::shell_quote_scalar($file)." 
$serverlogin:$rsync_destdir;"; } else { print $Global::original_stderr - "Warning: $file is not readable and will not be transferred\n"; + "parallel: Warning: $file is not readable and will not be transferred\n"; } } return $pre; @@ -4017,6 +4018,6 @@ sub unlock { # Keep perl -w happy -$::opt_regexp = $::opt_x = $::opt_workdir = $Semaphore::timeout = $Semaphore::wait = +$::opt_x = $::opt_workdir = $Semaphore::timeout = $Semaphore::wait = $::opt_skip_first_line = $::opt_shebang = 0 ; diff --git a/src/parallel.pod b/src/parallel.pod index 0a301f3f..d29ddcd7 100644 --- a/src/parallel.pod +++ b/src/parallel.pod @@ -28,8 +28,7 @@ If you use B today you will find GNU B very easy to use as GNU B is written to have the same options as B. If you write loops in shell, you will find GNU B may be able to replace most of the loops and make them run faster by -running several jobs simultaneously. If you use B or B you -will find GNU B will often make the command easier to read. +running several jobs simultaneously. GNU B makes sure output from the commands is the same output as you would get had you run the commands sequentially. This makes it @@ -713,8 +712,8 @@ If B<--recstart> is given I will be used to split at record start. If B<--recend> is given I will be used to split at record end. If both B<--recstart> and B<--recend> are given the string -II will have to match to find a split -position. This is useful if either I or I +II will have to match to find a split +position. This is useful if either I or I match in the middle of a record. If neither B<--recstart> nor B<--recend> are given then B<--recend> @@ -726,7 +725,7 @@ Use B<--regexp> to interpret B<--recstart> and B<--recend> as regular expressions. This is slow, however. -=item B<--regexp> (unimplimented) +=item B<--regexp> (beta test) Use B<--regexp> to interpret B<--recstart> and B<--recend> as regular expressions. This is slow, however. @@ -743,6 +742,7 @@ it to the command. Only used with B<--pipe>. 
+ =item B<--retries> I (beta testing) If a job fails, retry it on another computer. Do this I times. If @@ -1601,6 +1601,34 @@ B As there is not a I the jobs will be evaluated by the shell. +=head1 EXAMPLE: Processing a big file using more cores + +To process a big file or some output you can use B<--pipe> to split up +the data into blocks and pipe the blocks into the processing program. + +If the program is B you can do: + +B>B + +This will split B into blocks of 1 MB and pass that to B in parallel. One B will be run per CPU core. The output of +B will be kept in order and saved to B + +B works fine if the output is appended, but some processing does +not work like that - for example sorting. For this GNU B can +put the output of each command into a file. This will sort a big file +in parallel: + +B>B + +Here B is split into blocks of around 1MB, each block ending +in '\n' (which is the default for B<--recend>). Each block is passed +to B and the output from B is saved into files. These +files are passed to the second B that runs B on the +files before it removes the files. The output is saved to +B. + + =head1 EXAMPLE: Working as mutex and counting semaphore The command B is an alias for B. @@ -1921,7 +1949,7 @@ variable $PARALLEL which takes precedence over the file =head1 PROFILE FILES -If B<--profile> set, GNU B will read the profile from that file instead of +If B<--profile> set, GNU B will read the profile from that file instead of ~/.parallel/config. Example: Profile for running every command with B<-j+0> and B diff --git a/src/sql b/src/sql index 03085e60..a24ba7f0 100755 --- a/src/sql +++ b/src/sql @@ -531,7 +531,7 @@ $Global::Initfile && unlink $Global::Initfile; exit ($err); sub parse_options { - $Global::version = 20110126; + $Global::version = 20110130; $Global::progname = 'sql'; # This must be done first as this may exec myself