From bbb94ab9e4e01a015480f91eec537ebb18dd3842 Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Tue, 30 Aug 2022 23:19:06 +0200 Subject: [PATCH] parallel: Moved examples to parallel_examples. --- doc/haikus | 9 + src/Makefile.am | 92 ++- src/parallel | 15 +- src/parallel.pod | 1840 +--------------------------------------------- src/pod2graph | 9 +- 5 files changed, 97 insertions(+), 1868 deletions(-) diff --git a/doc/haikus b/doc/haikus index 4b77f8f9..7e2cf8d5 100644 --- a/doc/haikus +++ b/doc/haikus @@ -4,6 +4,15 @@ Quote of the month: + I've learned a lot during my internship, but getting even slightly more proficient with GNU parallel is probably the most important thing I've learned... + -- Elijah Rippeth @terrible_coder@twitter + + reduced our backend test pipelines from 4 to 1.30 hrs. gnu parallel for the win!!! + -- Swapnil Sahu @CaffeinatedWryy@twitter + + I honestly don't know how I'd survive without @Docker and @GNU_Parallel + -- Eric Pauley @EricPauley_@twitter + Gnu parallel is indeed slick. I always try to align my data to make it possible to loop over it with a nice for loop added in a call to run the jobs in parallel then becomes super easy. I love the {1..99} syntax in bash. -- ragsofx diff --git a/src/Makefile.am b/src/Makefile.am index a77dff06..12b975ba 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -13,25 +13,29 @@ install-exec-hook: $(LN_S) parallel "$(DESTDIR)$(bindir)"/sem if DOCUMENTATION -man_MANS = parallel.1 env_parallel.1 sem.1 sql.1 niceload.1 \ - parallel_tutorial.7 parallel_book.7 parallel_design.7 \ - parallel_alternatives.7 parcat.1 parset.1 parsort.1 +man_MANS = parallel.1 env_parallel.1 sem.1 sql.1 niceload.1 \ + parallel_examples.7 parallel_tutorial.7 parallel_book.7 \ + parallel_design.7 parallel_alternatives.7 parcat.1 parset.1 \ + parsort.1 doc_DATA = parallel.html env_parallel.html sem.html sql.html \ - niceload.html parallel_tutorial.html parallel_book.html \ - parallel_design.html parallel_alternatives.html parcat.html \ - parset.html parsort.html \ - parallel.texi env_parallel.texi sem.texi sql.texi \ - niceload.texi parallel_tutorial.texi parallel_book.texi \ - parallel_design.texi parallel_alternatives.texi parcat.texi \ - parset.texi parsort.texi \ - parallel.rst env_parallel.rst sem.rst sql.rst \ - niceload.rst parallel_tutorial.rst parallel_book.rst \ + niceload.html parallel_examples.html parallel_tutorial.html \ + parallel_book.html parallel_design.html \ + parallel_alternatives.html parcat.html parset.html \ + parsort.html \ + parallel.texi env_parallel.texi sem.texi sql.texi \ + niceload.texi parallel_examples.texi parallel_tutorial.texi \ + parallel_book.texi parallel_design.texi \ + parallel_alternatives.texi parcat.texi parset.texi \ + parsort.texi \ + parallel.rst env_parallel.rst sem.rst sql.rst niceload.rst \ + parallel_examples.rst parallel_tutorial.rst parallel_book.rst \ parallel_design.rst parallel_alternatives.rst parcat.rst \ parset.rst parsort.rst \ parallel.pdf env_parallel.pdf sem.pdf sql.pdf niceload.pdf \ - parallel_tutorial.pdf parallel_book.pdf parallel_design.pdf \ - parallel_alternatives.pdf parcat.pdf parset.pdf parsort.pdf \ - parallel_cheat_bw.pdf parallel_options_map.pdf + parallel_examples.pdf parallel_tutorial.pdf parallel_book.pdf \ + parallel_design.pdf parallel_alternatives.pdf parcat.pdf \ + parset.pdf parsort.pdf parallel_cheat_bw.pdf \ + parallel_options_map.pdf endif web: sphinx @@ -54,6 +58,12 @@ env_parallel.1: env_parallel.pod && mv "$(srcdir)"/env_parallel.1n "$(srcdir)"/env_parallel.1 \ || echo 
"Warning: pod2man not found. Using old env_parallel.1" +parallel_examples.7: parallel_examples.pod + pod2man --release='$(PACKAGE_VERSION)' --center='$(PACKAGE_NAME)' \ + --section=7 "$(srcdir)"/parallel_examples.pod > "$(srcdir)"/parallel_examples.7n \ + && mv "$(srcdir)"/parallel_examples.7n "$(srcdir)"/parallel_examples.7 \ + || echo "Warning: pod2man not found. Using old parallel_examples.7" + parallel_tutorial.7: parallel_tutorial.pod pod2man --release='$(PACKAGE_VERSION)' --center='$(PACKAGE_NAME)' \ --section=7 "$(srcdir)"/parallel_tutorial.pod > "$(srcdir)"/parallel_tutorial.7n \ @@ -128,6 +138,13 @@ env_parallel.html: env_parallel.pod parallel.html rm -f "$(srcdir)"/pod2htm* # Depending on env_parallel.html to avoid stupid pod2html race condition +parallel_examples.html: parallel_examples.pod env_parallel.html + pod2html --title "GNU Parallel examples" "$(srcdir)"/parallel_examples.pod > "$(srcdir)"/parallel_examples.htmln \ + && mv "$(srcdir)"/parallel_examples.htmln "$(srcdir)"/parallel_examples.html \ + || echo "Warning: pod2html not found. Using old parallel_examples.html" + rm -f "$(srcdir)"/pod2htm* + +# Depending on parallel_examples.html to avoid stupid pod2html race condition parallel_tutorial.html: parallel_tutorial.pod env_parallel.html pod2html --title "GNU Parallel tutorial" "$(srcdir)"/parallel_tutorial.pod > "$(srcdir)"/parallel_tutorial.htmln \ && mv "$(srcdir)"/parallel_tutorial.htmln "$(srcdir)"/parallel_tutorial.html \ @@ -205,6 +222,10 @@ env_parallel.texi: env_parallel.pod pod2texi --output="$(srcdir)"/env_parallel.texi "$(srcdir)"/env_parallel.pod \ || echo "Warning: pod2texi not found. Using old env_parallel.texi" +parallel_examples.texi: parallel_examples.pod + pod2texi --output="$(srcdir)"/parallel_examples.texi "$(srcdir)"/parallel_examples.pod \ + || echo "Warning: pod2texi not found. Using old parallel_examples.texi" + parallel_tutorial.texi: parallel_tutorial.pod pod2texi --output="$(srcdir)"/parallel_tutorial.texi "$(srcdir)"/parallel_tutorial.pod \ || echo "Warning: pod2texi not found. Using old parallel_tutorial.texi" @@ -253,6 +274,10 @@ env_parallel.rst: env_parallel.pod ./pod2rst-fix < "$(srcdir)"/env_parallel.pod > "$(srcdir)"/env_parallel.rst \ || echo "Warning: pod2rst not found. Using old env_parallel.rst" +parallel_examples.rst: parallel_examples.pod + ./pod2rst-fix < "$(srcdir)"/parallel_examples.pod > "$(srcdir)"/parallel_examples.rst \ + || echo "Warning: pod2rst not found. Using old parallel_examples.rst" + parallel_tutorial.rst: parallel_tutorial.pod ./pod2rst-fix < "$(srcdir)"/parallel_tutorial.pod > "$(srcdir)"/parallel_tutorial.rst \ || echo "Warning: pod2rst not found. Using old parallel_tutorial.rst" @@ -301,6 +326,10 @@ env_parallel.pdf: env_parallel.pod pod2pdf --output-file "$(srcdir)"/env_parallel.pdf "$(srcdir)"/env_parallel.pod --title "GNU Parallel with environment" \ || echo "Warning: pod2pdf not found. Using old env_parallel.pdf" +parallel_examples.pdf: parallel_examples.pod + pod2pdf --output-file "$(srcdir)"/parallel_examples.pdf "$(srcdir)"/parallel_examples.pod --title "GNU Parallel Examples" \ + || echo "Warning: pod2pdf not found. Using old parallel_examples.pdf" + parallel_tutorial.pdf: parallel_tutorial.pod pod2pdf --output-file "$(srcdir)"/parallel_tutorial.pdf "$(srcdir)"/parallel_tutorial.pod --title "GNU Parallel Tutorial" \ || echo "Warning: pod2pdf not found. 
Using old parallel_tutorial.pdf" @@ -353,23 +382,28 @@ sem: parallel ln -fs parallel sem DISTCLEANFILES = parallel.1 env_parallel.1 sem.1 sql.1 niceload.1 \ - parallel_tutorial.7 parallel_book.7 parallel_design.7 \ - parallel_alternatives.7 parcat.1 parset.1 parsort.1 \ + parallel_examples.7 parallel_tutorial.7 parallel_book.7 \ + parallel_design.7 parallel_alternatives.7 parcat.1 parset.1 \ + parsort.1 \ parallel.html env_parallel.html sem.html sql.html \ - niceload.html parallel_tutorial.html parallel_book.html \ - parallel_design.html parallel_alternatives.html parcat.html \ - parset.html parsort.html parallel.texi env_parallel.texi \ - sem.texi sql.texi niceload.texi parallel_tutorial.texi \ + niceload.html parallel_examples.html parallel_tutorial.html \ + parallel_book.html parallel_design.html \ + parallel_alternatives.html parcat.html parset.html \ + parsort.html \ + parallel.texi env_parallel.texi sem.texi sql.texi \ + niceload.texi parallel_examples.texi parallel_tutorial.texi \ parallel_book.texi parallel_design.texi \ parallel_alternatives.texi parcat.texi parset.texi \ parsort.texi \ parallel.rst env_parallel.rst sem.rst sql.rst niceload.rst \ - parallel_tutorial.rst parallel_book.rst parallel_design.rst \ - parallel_alternatives.rst parcat.rst parset.rst parsort.rst \ + parallel_examples.rst parallel_tutorial.rst parallel_book.rst \ + parallel_design.rst parallel_alternatives.rst parcat.rst \ + parset.rst parsort.rst \ parallel.pdf env_parallel.pdf sem.pdf sql.pdf niceload.pdf \ - parallel_tutorial.pdf parallel_book.pdf parallel_design.pdf \ - parallel_alternatives.pdf parcat.pdf parset.pdf parsort.pdf \ - parallel_cheat_bw.pdf parallel_options_map.pdf + parallel_examples.pdf parallel_tutorial.pdf parallel_book.pdf \ + parallel_design.pdf parallel_alternatives.pdf parcat.pdf \ + parset.pdf parsort.pdf parallel_cheat_bw.pdf \ + parallel_options_map.pdf EXTRA_DIST = parallel sem sql niceload parcat parset parsort \ env_parallel env_parallel.ash env_parallel.bash \ @@ -377,6 +411,6 @@ EXTRA_DIST = parallel sem sql niceload parcat parset parsort \ env_parallel.ksh env_parallel.mksh env_parallel.pdksh \ env_parallel.sh env_parallel.tcsh env_parallel.zsh parcat.pod \ parset.pod sem.pod parallel.pod env_parallel.pod niceload.pod \ - parallel_tutorial.pod parallel_book.pod parallel_design.pod \ - parallel_alternatives.pod parallel_cheat_bw.fodt \ - pod2graph $(DISTCLEANFILES) + parallel_examples.pod parallel_tutorial.pod parallel_book.pod \ + parallel_design.pod parallel_alternatives.pod \ + parallel_cheat_bw.fodt pod2graph $(DISTCLEANFILES) diff --git a/src/parallel b/src/parallel index 02734b16..32d464d3 100755 --- a/src/parallel +++ b/src/parallel @@ -11402,21 +11402,28 @@ sub print_parset($) { ::debug("parset","print $Global::parset"); if($Global::parset eq "assoc") { + # Start: (done in parse_parset()) # eval "`echo 'declare -A myassoc; myassoc=( - # Each: + # Each: (done here) # [$'a\tb']=$'a\tb\tc ddd' - # End: + # End: (done in wait_and_exit()) # )'`" print '[',::Q($self->{'commandline'}-> replace_placeholders(["\257<\257>"],0,0)),']='; } elsif($Global::parset eq "array") { + # Start: (done in parse_parset()) # eval "`echo 'myassoc=( - # Each: + # Each: (done here) # $'a\tb\tc ddd' - # End: + # End: (done in wait_and_exit()) # )'`" } elsif($Global::parset eq "var") { + # Start: (done in parse_parset()) + # + # Each: (done here) # var=$'a\tb\tc ddd' + # End: (done in wait_and_exit()) + # if(not @Global::parset_vars) { ::error("Too few named destination variables"); 
::wait_and_exit(255); diff --git a/src/parallel.pod b/src/parallel.pod index 9da2009d..aa116a9d 100644 --- a/src/parallel.pod +++ b/src/parallel.pod @@ -81,10 +81,10 @@ you for it. =head3 How-to -You can find a lot of Bs of use after the list of -B in B (Use B). That will -give you an idea of what GNU B is capable of, and you may -find a solution you can simply adapt to your situation. +You can find a lot of examples of use in B. They will give you an idea of what GNU B +is capable of, and you may find a solution you can simply adapt to +your situation. =head3 Reference @@ -3617,1838 +3617,10 @@ See also: B<-X> =back + =head1 EXAMPLES -=head2 EXAMPLE: Working as xargs -n1. Argument appending - -GNU B can work similar to B. - -To compress all html files using B run: - - find . -name '*.html' | parallel gzip --best - -If the file names may contain a newline use B<-0>. Substitute FOO BAR with -FUBAR in all files in this dir and subdirs: - - find . -type f -print0 | \ - parallel -q0 perl -i -pe 's/FOO BAR/FUBAR/g' - -Note B<-q> is needed because of the space in 'FOO BAR'. - - -=head2 EXAMPLE: Simple network scanner - -B can generate IP-addresses from CIDR notation. With GNU -B you can build a simple network scanner to see which -addresses respond to B: - - prips 130.229.16.0/20 | \ - parallel --timeout 2 -j0 \ - 'ping -c 1 {} >/dev/null && echo {}' 2>/dev/null - - -=head2 EXAMPLE: Reading arguments from command line - -GNU B can take the arguments from command line instead of -stdin (standard input). To compress all html files in the current dir -using B run: - - parallel gzip --best ::: *.html - -To convert *.wav to *.mp3 using LAME running one process per CPU run: - - parallel lame {} -o {.}.mp3 ::: *.wav - - -=head2 EXAMPLE: Inserting multiple arguments - -When moving a lot of files like this: B you will -sometimes get the error: - - bash: /bin/mv: Argument list too long - -because there are too many files. You can instead do: - - ls | grep -E '\.log$' | parallel mv {} destdir - -This will run B for each file. It can be done faster if B gets -as many arguments that will fit on the line: - - ls | grep -E '\.log$' | parallel -m mv {} destdir - -In many shells you can also use B: - - printf '%s\0' *.log | parallel -0 -m mv {} destdir - - -=head2 EXAMPLE: Context replace - -To remove the files I .. I you could do: - - seq -w 0 9999 | parallel rm pict{}.jpg - -You could also do: - - seq -w 0 9999 | perl -pe 's/(.*)/pict$1.jpg/' | parallel -m rm - -The first will run B 10000 times, while the last will only run -B as many times needed to keep the command line length short -enough to avoid B (it typically runs 1-2 times). - -You could also run: - - seq -w 0 9999 | parallel -X rm pict{}.jpg - -This will also only run B as many times needed to keep the command -line length short enough. - - -=head2 EXAMPLE: Compute intensive jobs and substitution - -If ImageMagick is installed this will generate a thumbnail of a jpg -file: - - convert -geometry 120 foo.jpg thumb_foo.jpg - -This will run with number-of-cpus jobs in parallel for all jpg files -in a directory: - - ls *.jpg | parallel convert -geometry 120 {} thumb_{} - -To do it recursively use B: - - find . -name '*.jpg' | \ - parallel convert -geometry 120 {} {}_thumb.jpg - -Notice how the argument has to start with B<{}> as B<{}> will include path -(e.g. running B would clearly be wrong). The command will -generate files like ./foo/bar.jpg_thumb.jpg. - -Use B<{.}> to avoid the extra .jpg in the file name. 
This command will -make files like ./foo/bar_thumb.jpg: - - find . -name '*.jpg' | \ - parallel convert -geometry 120 {} {.}_thumb.jpg - - -=head2 EXAMPLE: Substitution and redirection - -This will generate an uncompressed version of .gz-files next to the .gz-file: - - parallel zcat {} ">"{.} ::: *.gz - -Quoting of > is necessary to postpone the redirection. Another -solution is to quote the whole command: - - parallel "zcat {} >{.}" ::: *.gz - -Other special shell characters (such as * ; $ > < | >> <<) also need -to be put in quotes, as they may otherwise be interpreted by the shell -and not given to GNU B. - - -=head2 EXAMPLE: Composed commands - -A job can consist of several commands. This will print the number of -files in each directory: - - ls | parallel 'echo -n {}" "; ls {}|wc -l' - -To put the output in a file called .dir: - - ls | parallel '(echo -n {}" "; ls {}|wc -l) >{}.dir' - -Even small shell scripts can be run by GNU B: - - find . | parallel 'a={}; name=${a##*/};' \ - 'upper=$(echo "$name" | tr "[:lower:]" "[:upper:]");'\ - 'echo "$name - $upper"' - - ls | parallel 'mv {} "$(echo {} | tr "[:upper:]" "[:lower:]")"' - -Given a list of URLs, list all URLs that fail to download. Print the -line number and the URL. - - cat urlfile | parallel "wget {} 2>/dev/null || grep -n {} urlfile" - -Create a mirror directory with the same filenames except all files and -symlinks are empty files. - - cp -rs /the/source/dir mirror_dir - find mirror_dir -type l | parallel -m rm {} '&&' touch {} - -Find the files in a list that do not exist - - cat file_list | parallel 'if [ ! -e {} ] ; then echo {}; fi' - - -=head2 EXAMPLE: Composed command with perl replacement string - -You have a bunch of file. You want them sorted into dirs. The dir of -each file should be named the first letter of the file name. - - parallel 'mkdir -p {=s/(.).*/$1/=}; mv {} {=s/(.).*/$1/=}' ::: * - - -=head2 EXAMPLE: Composed command with multiple input sources - -You have a dir with files named as 24 hours in 5 minute intervals: -00:00, 00:05, 00:10 .. 23:55. You want to find the files missing: - - parallel [ -f {1}:{2} ] "||" echo {1}:{2} does not exist \ - ::: {00..23} ::: {00..55..5} - - -=head2 EXAMPLE: Calling Bash functions - -If the composed command is longer than a line, it becomes hard to -read. In Bash you can use functions. Just remember to B the -function. - - doit() { - echo Doing it for $1 - sleep 2 - echo Done with $1 - } - export -f doit - parallel doit ::: 1 2 3 - - doubleit() { - echo Doing it for $1 $2 - sleep 2 - echo Done with $1 $2 - } - export -f doubleit - parallel doubleit ::: 1 2 3 ::: a b - -To do this on remote servers you need to transfer the function using -B<--env>: - - parallel --env doit -S server doit ::: 1 2 3 - parallel --env doubleit -S server doubleit ::: 1 2 3 ::: a b - -If your environment (aliases, variables, and functions) is small you -can copy the full environment without having to -B anything. See B. - - -=head2 EXAMPLE: Function tester - -To test a program with different parameters: - - tester() { - if (eval "$@") >&/dev/null; then - perl -e 'printf "\033[30;102m[ OK ]\033[0m @ARGV\n"' "$@" - else - perl -e 'printf "\033[30;101m[FAIL]\033[0m @ARGV\n"' "$@" - fi - } - export -f tester - parallel tester my_program ::: arg1 arg2 - parallel tester exit ::: 1 0 2 0 - -If B fails a red FAIL will be printed followed by the failing -command; otherwise a green OK will be printed followed by the command. 
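Picking up the env_parallel note from the "Calling Bash functions" example above, a minimal sketch of that route (Bash assumed; "server" is a placeholder host and doit is the same toy function):

  # Activate env_parallel for Bash (it ships with GNU parallel)
  . "$(which env_parallel.bash)"

  doit() {
    echo Doing it for $1
    sleep 2
    echo Done with $1
  }

  # The whole environment is copied along, so no 'export -f' or --env is needed
  env_parallel doit ::: 1 2 3
  env_parallel -S server doit ::: 1 2 3

As noted above, this is only worthwhile when the environment is small.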
- - -=head2 EXAMPLE: Continously show the latest line of output - -It can be useful to monitor the output of running jobs. - -This shows the most recent output line until a job finishes. After -which the output of the job is printed in full: - - parallel '{} | tee >(cat >&3)' ::: 'command 1' 'command 2' \ - 3> >(perl -ne '$|=1;chomp;printf"%.'$COLUMNS's\r",$_." "x100') - - -=head2 EXAMPLE: Log rotate - -Log rotation renames a logfile to an extension with a higher number: -log.1 becomes log.2, log.2 becomes log.3, and so on. The oldest log is -removed. To avoid overwriting files the process starts backwards from -the high number to the low number. This will keep 10 old versions of -the log: - - seq 9 -1 1 | parallel -j1 mv log.{} log.'{= $_++ =}' - mv log log.1 - - -=head2 EXAMPLE: Removing file extension when processing files - -When processing files removing the file extension using B<{.}> is -often useful. - -Create a directory for each zip-file and unzip it in that dir: - - parallel 'mkdir {.}; cd {.}; unzip ../{}' ::: *.zip - -Recompress all .gz files in current directory using B running 1 -job per CPU in parallel: - - parallel "zcat {} | bzip2 >{.}.bz2 && rm {}" ::: *.gz - -Convert all WAV files to MP3 using LAME: - - find sounddir -type f -name '*.wav' | parallel lame {} -o {.}.mp3 - -Put all converted in the same directory: - - find sounddir -type f -name '*.wav' | \ - parallel lame {} -o mydir/{/.}.mp3 - - -=head2 EXAMPLE: Removing strings from the argument - -If you have directory with tar.gz files and want these extracted in -the corresponding dir (e.g foo.tar.gz will be extracted in the dir -foo) you can do: - - parallel --plus 'mkdir {..}; tar -C {..} -xf {}' ::: *.tar.gz - -If you want to remove a different ending, you can use {%string}: - - parallel --plus echo {%_demo} ::: mycode_demo keep_demo_here - -You can also remove a starting string with {#string} - - parallel --plus echo {#demo_} ::: demo_mycode keep_demo_here - -To remove a string anywhere you can use regular expressions with -{/regexp/replacement} and leave the replacement empty: - - parallel --plus echo {/demo_/} ::: demo_mycode remove_demo_here - - -=head2 EXAMPLE: Download 24 images for each of the past 30 days - -Let us assume a website stores images like: - - https://www.example.com/path/to/YYYYMMDD_##.jpg - -where YYYYMMDD is the date and ## is the number 01-24. This will -download images for the past 30 days: - - getit() { - date=$(date -d "today -$1 days" +%Y%m%d) - num=$2 - echo wget https://www.example.com/path/to/${date}_${num}.jpg - } - export -f getit - - parallel getit ::: $(seq 30) ::: $(seq -w 24) - -B<$(date -d "today -$1 days" +%Y%m%d)> will give the dates in -YYYYMMDD with B<$1> days subtracted. - - -=head2 EXAMPLE: Download world map from NASA - -NASA provides tiles to download on earthdata.nasa.gov. Download tiles -for Blue Marble world map and create a 10240x20480 map. 
- - base=https://map1a.vis.earthdata.nasa.gov/wmts-geo/wmts.cgi - service="SERVICE=WMTS&REQUEST=GetTile&VERSION=1.0.0" - layer="LAYER=BlueMarble_ShadedRelief_Bathymetry" - set="STYLE=&TILEMATRIXSET=EPSG4326_500m&TILEMATRIX=5" - tile="TILEROW={1}&TILECOL={2}" - format="FORMAT=image%2Fjpeg" - url="$base?$service&$layer&$set&$tile&$format" - - parallel -j0 -q wget "$url" -O {1}_{2}.jpg ::: {0..19} ::: {0..39} - parallel eval convert +append {}_{0..39}.jpg line{}.jpg ::: {0..19} - convert -append line{0..19}.jpg world.jpg - - -=head2 EXAMPLE: Download Apollo-11 images from NASA using jq - -Search NASA using their API to get JSON for images related to 'apollo -11' and has 'moon landing' in the description. - -The search query returns JSON containing URLs to JSON containing -collections of pictures. One of the pictures in each of these -collection is I. - -B is used to get the JSON for the search query. B is then -used to extract the URLs of the collections. B then calls -B to get each collection, which is passed to B to extract -the URLs of all images. B filters out the I images, and -B finally uses B to fetch the images. - - base="https://images-api.nasa.gov/search" - q="q=apollo 11" - description="description=moon landing" - media_type="media_type=image" - wget -O - "$base?$q&$description&$media_type" | - jq -r .collection.items[].href | - parallel wget -O - | - jq -r .[] | - grep large | - parallel wget - - -=head2 EXAMPLE: Download video playlist in parallel - -B is an excellent tool to download videos. It can, -however, not download videos in parallel. This takes a playlist and -downloads 10 videos in parallel. - - url='youtu.be/watch?v=0wOf2Fgi3DE&list=UU_cznB5YZZmvAmeq7Y3EriQ' - export url - youtube-dl --flat-playlist "https://$url" | - parallel --tagstring {#} --lb -j10 \ - youtube-dl --playlist-start {#} --playlist-end {#} '"https://$url"' - - -=head2 EXAMPLE: Prepend last modified date (ISO8601) to file name - - parallel mv {} '{= $a=pQ($_); $b=$_;' \ - '$_=qx{date -r "$a" +%FT%T}; chomp; $_="$_ $b" =}' ::: * - -B<{=> and B<=}> mark a perl expression. B perl-quotes the -string. B is the date in ISO8601 with time. - -=head2 EXAMPLE: Save output in ISO8601 dirs - -Save output from B every second into dirs named -yyyy-mm-ddThh:mm:ss+zz:zz. - - seq 1000 | parallel -N0 -j1 --delay 1 \ - --results '{= $_=`date -Isec`; chomp=}/' ps aux - - -=head2 EXAMPLE: Digital clock with "blinking" : - -The : in a digital clock blinks. To make every other line have a ':' -and the rest a ' ' a perl expression is used to look at the 3rd input -source. If the value modulo 2 is 1: Use ":" otherwise use " ": - - parallel -k echo {1}'{=3 $_=$_%2?":":" "=}'{2}{3} \ - ::: {0..12} ::: {0..5} ::: {0..9} - - -=head2 EXAMPLE: Aggregating content of files - -This: - - parallel --header : echo x{X}y{Y}z{Z} \> x{X}y{Y}z{Z} \ - ::: X {1..5} ::: Y {01..10} ::: Z {1..5} - -will generate the files x1y01z1 .. x5y10z5. If you want to aggregate -the output grouping on x and z you can do this: - - parallel eval 'cat {=s/y01/y*/=} > {=s/y01//=}' ::: *y01* - -For all values of x and z it runs commands like: - - cat x1y*z1 > x1z1 - -So you end up with x1z1 .. x5z5 each containing the content of all -values of y. - - -=head2 EXAMPLE: Breadth first parallel web crawler/mirrorer - -This script below will crawl and mirror a URL in parallel. It -downloads first pages that are 1 click down, then 2 clicks down, then -3; instead of the normal depth first, where the first link link on -each page is fetched first. 
- -Run like this: - - PARALLEL=-j100 ./parallel-crawl http://gatt.org.yeslab.org/ - -Remove the B part if you only want a web crawler. - -It works by fetching a page from a list of URLs and looking for links -in that page that are within the same starting URL and that have not -already been seen. These links are added to a new queue. When all the -pages from the list is done, the new queue is moved to the list of -URLs and the process is started over until no unseen links are found. - - #!/bin/bash - - # E.g. http://gatt.org.yeslab.org/ - URL=$1 - # Stay inside the start dir - BASEURL=$(echo $URL | perl -pe 's:#.*::; s:(//.*/)[^/]*:$1:') - URLLIST=$(mktemp urllist.XXXX) - URLLIST2=$(mktemp urllist.XXXX) - SEEN=$(mktemp seen.XXXX) - - # Spider to get the URLs - echo $URL >$URLLIST - cp $URLLIST $SEEN - - while [ -s $URLLIST ] ; do - cat $URLLIST | - parallel lynx -listonly -image_links -dump {} \; \ - wget -qm -l1 -Q1 {} \; echo Spidered: {} \>\&2 | - perl -ne 's/#.*//; s/\s+\d+.\s(\S+)$/$1/ and - do { $seen{$1}++ or print }' | - grep -F $BASEURL | - grep -v -x -F -f $SEEN | tee -a $SEEN > $URLLIST2 - mv $URLLIST2 $URLLIST - done - - rm -f $URLLIST $URLLIST2 $SEEN - - -=head2 EXAMPLE: Process files from a tar file while unpacking - -If the files to be processed are in a tar file then unpacking one file -and processing it immediately may be faster than first unpacking all -files. - - tar xvf foo.tgz | perl -ne 'print $l;$l=$_;END{print $l}' | \ - parallel echo - -The Perl one-liner is needed to make sure the file is complete before -handing it to GNU B. - - -=head2 EXAMPLE: Rewriting a for-loop and a while-read-loop - -for-loops like this: - - (for x in `cat list` ; do - do_something $x - done) | process_output - -and while-read-loops like this: - - cat list | (while read x ; do - do_something $x - done) | process_output - -can be written like this: - - cat list | parallel do_something | process_output - -For example: Find which host name in a list has IP address 1.2.3 4: - - cat hosts.txt | parallel -P 100 host | grep 1.2.3.4 - -If the processing requires more steps the for-loop like this: - - (for x in `cat list` ; do - no_extension=${x%.*}; - do_step1 $x scale $no_extension.jpg - do_step2 <$x $no_extension - done) | process_output - -and while-loops like this: - - cat list | (while read x ; do - no_extension=${x%.*}; - do_step1 $x scale $no_extension.jpg - do_step2 <$x $no_extension - done) | process_output - -can be written like this: - - cat list | parallel "do_step1 {} scale {.}.jpg ; do_step2 <{} {.}" |\ - process_output - -If the body of the loop is bigger, it improves readability to use a function: - - (for x in `cat list` ; do - do_something $x - [... 100 lines that do something with $x ...] - done) | process_output - - cat list | (while read x ; do - do_something $x - [... 100 lines that do something with $x ...] - done) | process_output - -can both be rewritten as: - - doit() { - x=$1 - do_something $x - [... 100 lines that do something with $x ...] 
- } - export -f doit - cat list | parallel doit - -=head2 EXAMPLE: Rewriting nested for-loops - -Nested for-loops like this: - - (for x in `cat xlist` ; do - for y in `cat ylist` ; do - do_something $x $y - done - done) | process_output - -can be written like this: - - parallel do_something {1} {2} :::: xlist ylist | process_output - -Nested for-loops like this: - - (for colour in red green blue ; do - for size in S M L XL XXL ; do - echo $colour $size - done - done) | sort - -can be written like this: - - parallel echo {1} {2} ::: red green blue ::: S M L XL XXL | sort - - -=head2 EXAMPLE: Finding the lowest difference between files - -B is good for finding differences in text files. B -gives an indication of the size of the difference. To find the -differences between all files in the current dir do: - - parallel --tag 'diff {1} {2} | wc -l' ::: * ::: * | sort -nk3 - -This way it is possible to see if some files are closer to other -files. - - -=head2 EXAMPLE: for-loops with column names - -When doing multiple nested for-loops it can be easier to keep track of -the loop variable if is is named instead of just having a number. Use -B<--header :> to let the first argument be an named alias for the -positional replacement string: - - parallel --header : echo {colour} {size} \ - ::: colour red green blue ::: size S M L XL XXL - -This also works if the input file is a file with columns: - - cat addressbook.tsv | \ - parallel --colsep '\t' --header : echo {Name} {E-mail address} - - -=head2 EXAMPLE: All combinations in a list - -GNU B makes all combinations when given two lists. - -To make all combinations in a single list with unique values, you -repeat the list and use replacement string B<{choose_k}>: - - parallel --plus echo {choose_k} ::: A B C D ::: A B C D - - parallel --plus echo 2{2choose_k} 1{1choose_k} ::: A B C D ::: A B C D - -B<{choose_k}> works for any number of input sources: - - parallel --plus echo {choose_k} ::: A B C D ::: A B C D ::: A B C D - -Where B<{choose_k}> does not care about order, B<{uniq}> cares about -order. It simply skips jobs where values from different input sources -are the same: - - parallel --plus echo {uniq} ::: A B C ::: A B C ::: A B C - parallel --plus echo {1uniq}+{2uniq}+{3uniq} ::: A B C ::: A B C ::: A B C - - -=head2 EXAMPLE: From a to b and b to c - -Assume you have input like: - - aardvark - babble - cab - dab - each - -and want to run combinations like: - - aardvark babble - babble cab - cab dab - dab each - -If the input is in the file in.txt: - - parallel echo {1} - {2} ::::+ <(head -n -1 in.txt) <(tail -n +2 in.txt) - -If the input is in the array $a here are two solutions: - - seq $((${#a[@]}-1)) | \ - env_parallel --env a echo '${a[{=$_--=}]} - ${a[{}]}' - parallel echo {1} - {2} ::: "${a[@]::${#a[@]}-1}" :::+ "${a[@]:1}" - - -=head2 EXAMPLE: Count the differences between all files in a dir - -Using B<--results> the results are saved in /tmp/diffcount*. - - parallel --results /tmp/diffcount "diff -U 0 {1} {2} | \ - tail -n +3 |grep -v '^@'|wc -l" ::: * ::: * - -To see the difference between file A and file B look at the file -'/tmp/diffcount/1/A/2/B'. - - -=head2 EXAMPLE: Speeding up fast jobs - -Starting a job on the local machine takes around 3-10 ms. This can be -a big overhead if the job takes very few ms to run. Often you can -group small jobs together using B<-X> which will make the overhead -less significant. 
Compare the speed of these: - - seq -w 0 9999 | parallel touch pict{}.jpg - seq -w 0 9999 | parallel -X touch pict{}.jpg - -If your program cannot take multiple arguments, then you can use GNU -B to spawn multiple GNU Bs: - - seq -w 0 9999999 | \ - parallel -j10 -q -I,, --pipe parallel -j0 touch pict{}.jpg - -If B<-j0> normally spawns 252 jobs, then the above will try to spawn -2520 jobs. On a normal GNU/Linux system you can spawn 32000 jobs using -this technique with no problems. To raise the 32000 jobs limit raise -/proc/sys/kernel/pid_max to 4194303. - -If you do not need GNU B to have control over each job (so -no need for B<--retries> or B<--joblog> or similar), then it can be -even faster if you can generate the command lines and pipe those to a -shell. So if you can do this: - - mygenerator | sh - -Then that can be parallelized like this: - - mygenerator | parallel --pipe --block 10M sh - -E.g. - - mygenerator() { - seq 10000000 | perl -pe 'print "echo This is fast job number "'; - } - mygenerator | parallel --pipe --block 10M sh - -The overhead is 100000 times smaller namely around 100 nanoseconds per -job. - - -=head2 EXAMPLE: Using shell variables - -When using shell variables you need to quote them correctly as they -may otherwise be interpreted by the shell. - -Notice the difference between: - - ARR=("My brother's 12\" records are worth <\$\$\$>"'!' Foo Bar) - parallel echo ::: ${ARR[@]} # This is probably not what you want - -and: - - ARR=("My brother's 12\" records are worth <\$\$\$>"'!' Foo Bar) - parallel echo ::: "${ARR[@]}" - -When using variables in the actual command that contains special -characters (e.g. space) you can quote them using B<'"$VAR"'> or using -"'s and B<-q>: - - VAR="My brother's 12\" records are worth <\$\$\$>" - parallel -q echo "$VAR" ::: '!' - export VAR - parallel echo '"$VAR"' ::: '!' - -If B<$VAR> does not contain ' then B<"'$VAR'"> will also work -(and does not need B): - - VAR="My 12\" records are worth <\$\$\$>" - parallel echo "'$VAR'" ::: '!' - -If you use them in a function you just quote as you normally would do: - - VAR="My brother's 12\" records are worth <\$\$\$>" - export VAR - myfunc() { echo "$VAR" "$1"; } - export -f myfunc - parallel myfunc ::: '!' - - -=head2 EXAMPLE: Group output lines - -When running jobs that output data, you often do not want the output -of multiple jobs to run together. GNU B defaults to grouping -the output of each job, so the output is printed when the job -finishes. If you want full lines to be printed while the job is -running you can use B<--line-buffer>. If you want output to be -printed as soon as possible you can use B<-u>. - -Compare the output of: - - parallel wget --limit-rate=100k \ - https://ftpmirror.gnu.org/parallel/parallel-20{}0822.tar.bz2 \ - ::: {12..16} - parallel --line-buffer wget --limit-rate=100k \ - https://ftpmirror.gnu.org/parallel/parallel-20{}0822.tar.bz2 \ - ::: {12..16} - parallel -u wget --limit-rate=100k \ - https://ftpmirror.gnu.org/parallel/parallel-20{}0822.tar.bz2 \ - ::: {12..16} - -=head2 EXAMPLE: Tag output lines - -GNU B groups the output lines, but it can be hard to see -where the different jobs begin. 
B<--tag> prepends the argument to make -that more visible: - - parallel --tag wget --limit-rate=100k \ - https://ftpmirror.gnu.org/parallel/parallel-20{}0822.tar.bz2 \ - ::: {12..16} - -B<--tag> works with B<--line-buffer> but not with B<-u>: - - parallel --tag --line-buffer wget --limit-rate=100k \ - https://ftpmirror.gnu.org/parallel/parallel-20{}0822.tar.bz2 \ - ::: {12..16} - -Check the uptime of the servers in I<~/.parallel/sshloginfile>: - - parallel --tag -S .. --nonall uptime - - -=head2 EXAMPLE: Colorize output - -Give each job a new color. Most terminals support ANSI colors with the -escape code "\033[30;3Xm" where 0 <= X <= 7: - - seq 10 | \ - parallel --tagstring '\033[30;3{=$_=++$::color%8=}m' seq {} - parallel --rpl '{color} $_="\033[30;3".(++$::color%8)."m"' \ - --tagstring {color} seq {} ::: {1..10} - -To get rid of the initial \t (which comes from B<--tagstring>): - - ... | perl -pe 's/\t//' - - -=head2 EXAMPLE: Keep order of output same as order of input - -Normally the output of a job will be printed as soon as it -completes. Sometimes you want the order of the output to remain the -same as the order of the input. This is often important, if the output -is used as input for another system. B<-k> will make sure the order of -output will be in the same order as input even if later jobs end -before earlier jobs. - -Append a string to every line in a text file: - - cat textfile | parallel -k echo {} append_string - -If you remove B<-k> some of the lines may come out in the wrong order. - -Another example is B: - - parallel traceroute ::: qubes-os.org debian.org freenetproject.org - -will give traceroute of qubes-os.org, debian.org and -freenetproject.org, but it will be sorted according to which job -completed first. - -To keep the order the same as input run: - - parallel -k traceroute ::: qubes-os.org debian.org freenetproject.org - -This will make sure the traceroute to qubes-os.org will be printed -first. - -A bit more complex example is downloading a huge file in chunks in -parallel: Some internet connections will deliver more data if you -download files in parallel. For downloading files in parallel see: -"EXAMPLE: Download 10 images for each of the past 30 days". But if you -are downloading a big file you can download the file in chunks in -parallel. - -To download byte 10000000-19999999 you can use B: - - curl -r 10000000-19999999 https://example.com/the/big/file >file.part - -To download a 1 GB file we need 100 10MB chunks downloaded and -combined in the correct order. - - seq 0 99 | parallel -k curl -r \ - {}0000000-{}9999999 https://example.com/the/big/file > file - - -=head2 EXAMPLE: Parallel grep - -B greps recursively through directories. GNU B can -often speed this up. - - find . -type f | parallel -k -j150% -n 1000 -m grep -H -n STRING {} - -This will run 1.5 job per CPU, and give 1000 arguments to B. - -There are situations where the above will be slower than B: - -=over 2 - -=item * - -If data is already in RAM. The overhead of starting jobs and buffering -output may outweigh the benefit of running in parallel. - -=item * - -If the files are big. If a file cannot be read in a single seek, the -disk may start thrashing. - -=back - -The speedup is caused by two factors: - -=over 2 - -=item * - -On rotating harddisks small files often require a seek for each -file. By searching for more files in parallel, the arm may pass -another wanted file on its way. - -=item * - -NVMe drives often perform better by having multiple command running in -parallel. 
- -=back - - -=head2 EXAMPLE: Grepping n lines for m regular expressions. - -The simplest solution to grep a big file for a lot of regexps is: - - grep -f regexps.txt bigfile - -Or if the regexps are fixed strings: - - grep -F -f regexps.txt bigfile - -There are 3 limiting factors: CPU, RAM, and disk I/O. - -RAM is easy to measure: If the B process takes up most of your -free memory (e.g. when running B), then RAM is a limiting factor. - -CPU is also easy to measure: If the B takes >90% CPU in B, -then the CPU is a limiting factor, and parallelization will speed this -up. - -It is harder to see if disk I/O is the limiting factor, and depending -on the disk system it may be faster or slower to parallelize. The only -way to know for certain is to test and measure. - - -=head3 Limiting factor: RAM - -The normal B works no matter the size of -bigfile, but if regexps.txt is so big it cannot fit into memory, then -you need to split this. - -B takes around 100 bytes of RAM and B takes about 500 -bytes of RAM per 1 byte of regexp. So if regexps.txt is 1% of your -RAM, then it may be too big. - -If you can convert your regexps into fixed strings do that. E.g. if -the lines you are looking for in bigfile all looks like: - - ID1 foo bar baz Identifier1 quux - fubar ID2 foo bar baz Identifier2 - -then your regexps.txt can be converted from: - - ID1.*Identifier1 - ID2.*Identifier2 - -into: - - ID1 foo bar baz Identifier1 - ID2 foo bar baz Identifier2 - -This way you can use B which takes around 80% less memory and -is much faster. - -If it still does not fit in memory you can do this: - - parallel --pipe-part -a regexps.txt --block 1M grep -F -f - -n bigfile | \ - sort -un | perl -pe 's/^\d+://' - -The 1M should be your free memory divided by the number of CPU threads and -divided by 200 for B and by 1000 for normal B. On -GNU/Linux you can do: - - free=$(awk '/^((Swap)?Cached|MemFree|Buffers):/ { sum += $2 } - END { print sum }' /proc/meminfo) - percpu=$((free / 200 / $(parallel --number-of-threads)))k - - parallel --pipe-part -a regexps.txt --block $percpu --compress \ - grep -F -f - -n bigfile | \ - sort -un | perl -pe 's/^\d+://' - -If you can live with duplicated lines and wrong order, it is faster to do: - - parallel --pipe-part -a regexps.txt --block $percpu --compress \ - grep -F -f - bigfile - -=head3 Limiting factor: CPU - -If the CPU is the limiting factor parallelization should be done on -the regexps: - - cat regexps.txt | parallel --pipe -L1000 --round-robin --compress \ - grep -f - -n bigfile | \ - sort -un | perl -pe 's/^\d+://' - -The command will start one B per CPU and read I one -time per CPU, but as that is done in parallel, all reads except the -first will be cached in RAM. Depending on the size of I it -may be faster to use B<--block 10m> instead of B<-L1000>. - -Some storage systems perform better when reading multiple chunks in -parallel. This is true for some RAID systems and for some network file -systems. To parallelize the reading of I: - - parallel --pipe-part --block 100M -a bigfile -k --compress \ - grep -f regexps.txt - -This will split I into 100MB chunks and run B on each of -these chunks. To parallelize both reading of I and I -combine the two using B<--cat>: - - parallel --pipe-part --block 100M -a bigfile --cat cat regexps.txt \ - \| parallel --pipe -L1000 --round-robin grep -f - {} - -If a line matches multiple regexps, the line may be duplicated. - -=head3 Bigger problem - -If the problem is too big to be solved by this, you are probably ready -for Lucene. 
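Returning to the CPU-bound case above: the text mentions that --block 10m may be faster than -L1000 when regexps.txt is large. Spelled out, that variant of the pipeline is simply (a sketch, not a benchmark):

  # Same pipeline as above, but split regexps.txt by bytes instead of lines
  cat regexps.txt |
    parallel --pipe --block 10m --round-robin --compress \
      grep -f - -n bigfile |
    sort -un | perl -pe 's/^\d+://'

As before, a line matching several regexps may be emitted more than once before sort -un collapses the duplicates.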
- - -=head2 EXAMPLE: Using remote computers - -To run commands on a remote computer SSH needs to be set up and you -must be able to login without entering a password (The commands -B, B, and B may help you do that). - -If you need to login to a whole cluster, you typically do not want to -accept the host key for every host. You want to accept them the first -time and be warned if they are ever changed. To do that: - - # Add the servers to the sshloginfile - (echo servera; echo serverb) > .parallel/my_cluster - # Make sure .ssh/config exist - touch .ssh/config - cp .ssh/config .ssh/config.backup - # Disable StrictHostKeyChecking temporarily - (echo 'Host *'; echo StrictHostKeyChecking no) >> .ssh/config - parallel --slf my_cluster --nonall true - # Remove the disabling of StrictHostKeyChecking - mv .ssh/config.backup .ssh/config - -The servers in B<.parallel/my_cluster> are now added in B<.ssh/known_hosts>. - -To run B on B: - - seq 10 | parallel --sshlogin server.example.com echo - -To run commands on more than one remote computer run: - - seq 10 | parallel --sshlogin s1.example.com,s2.example.net echo - -Or: - - seq 10 | parallel --sshlogin server.example.com \ - --sshlogin server2.example.net echo - -If the login username is I on I use: - - seq 10 | parallel --sshlogin server.example.com \ - --sshlogin foo@server2.example.net echo - -If your list of hosts is I with login I: - - seq 10 | parallel -Sfoo@server{1..88}.example.net echo - -To distribute the commands to a list of computers, make a file -I with all the computers: - - server.example.com - foo@server2.example.com - server3.example.com - -Then run: - - seq 10 | parallel --sshloginfile mycomputers echo - -To include the local computer add the special sshlogin ':' to the list: - - server.example.com - foo@server2.example.com - server3.example.com - : - -GNU B will try to determine the number of CPUs on each of -the remote computers, and run one job per CPU - even if the remote -computers do not have the same number of CPUs. - -If the number of CPUs on the remote computers is not identified -correctly the number of CPUs can be added in front. Here the computer -has 8 CPUs. - - seq 10 | parallel --sshlogin 8/server.example.com echo - - -=head2 EXAMPLE: Transferring of files - -To recompress gzipped files with B using a remote computer run: - - find logs/ -name '*.gz' | \ - parallel --sshlogin server.example.com \ - --transfer "zcat {} | bzip2 -9 >{.}.bz2" - -This will list the .gz-files in the I directory and all -directories below. Then it will transfer the files to -I to the corresponding directory in -I<$HOME/logs>. On I the file will be recompressed -using B and B resulting in the corresponding file with -I<.gz> replaced with I<.bz2>. - -If you want the resulting bz2-file to be transferred back to the local -computer add I<--return {.}.bz2>: - - find logs/ -name '*.gz' | \ - parallel --sshlogin server.example.com \ - --transfer --return {.}.bz2 "zcat {} | bzip2 -9 >{.}.bz2" - -After the recompressing is done the I<.bz2>-file is transferred back to -the local computer and put next to the original I<.gz>-file. - -If you want to delete the transferred files on the remote computer add -I<--cleanup>. 
This will remove both the file transferred to the remote -computer and the files transferred from the remote computer: - - find logs/ -name '*.gz' | \ - parallel --sshlogin server.example.com \ - --transfer --return {.}.bz2 --cleanup "zcat {} | bzip2 -9 >{.}.bz2" - -If you want run on several computers add the computers to I<--sshlogin> -either using ',' or multiple I<--sshlogin>: - - find logs/ -name '*.gz' | \ - parallel --sshlogin server.example.com,server2.example.com \ - --sshlogin server3.example.com \ - --transfer --return {.}.bz2 --cleanup "zcat {} | bzip2 -9 >{.}.bz2" - -You can add the local computer using I<--sshlogin :>. This will disable the -removing and transferring for the local computer only: - - find logs/ -name '*.gz' | \ - parallel --sshlogin server.example.com,server2.example.com \ - --sshlogin server3.example.com \ - --sshlogin : \ - --transfer --return {.}.bz2 --cleanup "zcat {} | bzip2 -9 >{.}.bz2" - -Often I<--transfer>, I<--return> and I<--cleanup> are used together. They can be -shortened to I<--trc>: - - find logs/ -name '*.gz' | \ - parallel --sshlogin server.example.com,server2.example.com \ - --sshlogin server3.example.com \ - --sshlogin : \ - --trc {.}.bz2 "zcat {} | bzip2 -9 >{.}.bz2" - -With the file I containing the list of computers it becomes: - - find logs/ -name '*.gz' | parallel --sshloginfile mycomputers \ - --trc {.}.bz2 "zcat {} | bzip2 -9 >{.}.bz2" - -If the file I<~/.parallel/sshloginfile> contains the list of computers -the special short hand I<-S ..> can be used: - - find logs/ -name '*.gz' | parallel -S .. \ - --trc {.}.bz2 "zcat {} | bzip2 -9 >{.}.bz2" - - -=head2 EXAMPLE: Advanced file transfer - -Assume you have files in in/*, want them processed on server, -and transferred back into /other/dir: - - parallel -S server --trc /other/dir/./{/}.out \ - cp {/} {/}.out ::: in/./* - - -=head2 EXAMPLE: Distributing work to local and remote computers - -Convert *.mp3 to *.ogg running one process per CPU on local computer -and server2: - - parallel --trc {.}.ogg -S server2,: \ - 'mpg321 -w - {} | oggenc -q0 - -o {.}.ogg' ::: *.mp3 - - -=head2 EXAMPLE: Running the same command on remote computers - -To run the command B on remote computers you can do: - - parallel --tag --nonall -S server1,server2 uptime - -B<--nonall> reads no arguments. If you have a list of jobs you want -to run on each computer you can do: - - parallel --tag --onall -S server1,server2 echo ::: 1 2 3 - -Remove B<--tag> if you do not want the sshlogin added before the -output. - -If you have a lot of hosts use '-j0' to access more hosts in parallel. - - -=head2 EXAMPLE: Running 'sudo' on remote computers - -Put the password into passwordfile then run: - - parallel --ssh 'cat passwordfile | ssh' --nonall \ - -S user@server1,user@server2 sudo -S ls -l /root - - -=head2 EXAMPLE: Using remote computers behind NAT wall - -If the workers are behind a NAT wall, you need some trickery to get to -them. - -If you can B to a jumphost, and reach the workers from there, -then the obvious solution would be this, but it B: - - parallel --ssh 'ssh jumphost ssh' -S host1 echo ::: DOES NOT WORK - -It does not work because the command is dequoted by B twice where -as GNU B only expects it to be dequoted once. 
- -You can use a bash function and have GNU B quote the command: - - jumpssh() { ssh -A jumphost ssh $(parallel --shellquote ::: "$@"); } - export -f jumpssh - parallel --ssh jumpssh -S host1 echo ::: this works - -Or you can instead put this in B<~/.ssh/config>: - - Host host1 host2 host3 - ProxyCommand ssh jumphost.domain nc -w 1 %h 22 - -It requires B to be installed on jumphost. With this you -can simply: - - parallel -S host1,host2,host3 echo ::: This does work - -=head3 No jumphost, but port forwards - -If there is no jumphost but each server has port 22 forwarded from the -firewall (e.g. the firewall's port 22001 = port 22 on host1, 22002 = host2, -22003 = host3) then you can use B<~/.ssh/config>: - - Host host1.v - Port 22001 - Host host2.v - Port 22002 - Host host3.v - Port 22003 - Host *.v - Hostname firewall - -And then use host{1..3}.v as normal hosts: - - parallel -S host1.v,host2.v,host3.v echo ::: a b c - -=head3 No jumphost, no port forwards - -If ports cannot be forwarded, you need some sort of VPN to traverse -the NAT-wall. TOR is one options for that, as it is very easy to get -working. - -You need to install TOR and setup a hidden service. In B put: - - HiddenServiceDir /var/lib/tor/hidden_service/ - HiddenServicePort 22 127.0.0.1:22 - -Then start TOR: B - -The TOR hostname is now in B and -is something similar to B. Now you simply -prepend B to B: - - parallel --ssh 'torsocks ssh' -S izjafdceobowklhz.onion \ - -S zfcdaeiojoklbwhz.onion,auclucjzobowklhi.onion echo ::: a b c - -If not all hosts are accessible through TOR: - - parallel -S 'torsocks ssh izjafdceobowklhz.onion,host2,host3' \ - echo ::: a b c - -See more B tricks on https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Proxies_and_Jump_Hosts - - -=head2 EXAMPLE: Use sshpass with ssh - -If you cannot use passwordless login, you may be able to use B: - - export SSHPASS=MyPa$$w0rd - seq 10 | - parallel -S '4/sshpass -e ssh user-with-password@server' echo - seq 10 | - parallel --ssh 'sshpass -e ssh' -S 4/user-with-password@server' echo - - -=head2 EXAMPLE: Use outrun instead of ssh - -B lets you run a command on a remote server. B sets up -a connection to access files at the source server, and automatically -transfers files. B must be installed on the remote system. - -You can use B in an sshlogin this way: - - parallel -S 'outrun user@server' command - -or: - - parallel --ssh outrun -S server command - - -=head2 EXAMPLE: Slurm cluster - -The Slurm Workload Manager is used in many clusters. - -Here is a simple example of using GNU B to call B: - - #!/bin/bash - - #SBATCH --time 00:02:00 - #SBATCH --ntasks=4 - #SBATCH --job-name GnuParallelDemo - #SBATCH --output gnuparallel.out - - module purge - module load gnu_parallel - - my_parallel="parallel --delay .2 -j $SLURM_NTASKS" - my_srun="srun --export=all --exclusive -n1 --cpus-per-task=1 --cpu-bind=cores" - $my_parallel "$my_srun" echo This is job {} ::: {1..20} - - -=head2 EXAMPLE: Parallelizing rsync - -B is a great tool, but sometimes it will not fill up the -available bandwidth. Running multiple B in parallel can fix -this. - - cd src-dir - find . -type f | - parallel -j10 -X rsync -zR -Ha ./{} fooserver:/dest-dir/ - -Adjust B<-j10> until you find the optimal number. - -B will create the needed subdirectories, so all files are -not put into a single dir. The B<./> is needed so the resulting command -looks similar to: - - rsync -zR ././sub/dir/file fooserver:/dest-dir/ - -The B is what B works on. 
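To make the /./ anchor concrete, a single-file sketch using the same hypothetical fooserver and /dest-dir/ as above:

  cd src-dir
  # -R (--relative) recreates the path to the right of the /./ anchor,
  # so the file below ends up as /dest-dir/sub/dir/file on fooserver
  rsync -zR -Ha ./sub/dir/file fooserver:/dest-dir/

Moving the /./ deeper into the path would shorten the directory structure that gets recreated on the receiving side.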
- -If you are unable to push data, but need to pull them and the files -are called digits.png (e.g. 000000.png) you might be able to do: - - seq -w 0 99 | parallel rsync -Havessh fooserver:src/*{}.png destdir/ - - -=head2 EXAMPLE: Use multiple inputs in one command - -Copy files like foo.es.ext to foo.ext: - - ls *.es.* | perl -pe 'print; s/\.es//' | parallel -N2 cp {1} {2} - -The perl command spits out 2 lines for each input. GNU B -takes 2 inputs (using B<-N2>) and replaces {1} and {2} with the inputs. - -Count in binary: - - parallel -k echo ::: 0 1 ::: 0 1 ::: 0 1 ::: 0 1 ::: 0 1 ::: 0 1 - -Print the number on the opposing sides of a six sided die: - - parallel --link -a <(seq 6) -a <(seq 6 -1 1) echo - parallel --link echo :::: <(seq 6) <(seq 6 -1 1) - -Convert files from all subdirs to PNG-files with consecutive numbers -(useful for making input PNG's for B): - - parallel --link -a <(find . -type f | sort) \ - -a <(seq $(find . -type f|wc -l)) convert {1} {2}.png - -Alternative version: - - find . -type f | sort | parallel convert {} {#}.png - - -=head2 EXAMPLE: Use a table as input - -Content of table_file.tsv: - - foobar - baz quux - -To run: - - cmd -o bar -i foo - cmd -o quux -i baz - -you can run: - - parallel -a table_file.tsv --colsep '\t' cmd -o {2} -i {1} - -Note: The default for GNU B is to remove the spaces around -the columns. To keep the spaces: - - parallel -a table_file.tsv --trim n --colsep '\t' cmd -o {2} -i {1} - - -=head2 EXAMPLE: Output to database - -GNU B can output to a database table and a CSV-file: - - dburl=csv:///%2Ftmp%2Fmydir - dbtableurl=$dburl/mytable.csv - parallel --sqlandworker $dbtableurl seq ::: {1..10} - -It is rather slow and takes up a lot of CPU time because GNU -B parses the whole CSV file for each update. - -A better approach is to use an SQLite-base and then convert that to CSV: - - dburl=sqlite3:///%2Ftmp%2Fmy.sqlite - dbtableurl=$dburl/mytable - parallel --sqlandworker $dbtableurl seq ::: {1..10} - sql $dburl '.headers on' '.mode csv' 'SELECT * FROM mytable;' - -This takes around a second per job. 
- -If you have access to a real database system, such as PostgreSQL, it -is even faster: - - dburl=pg://user:pass@host/mydb - dbtableurl=$dburl/mytable - parallel --sqlandworker $dbtableurl seq ::: {1..10} - sql $dburl \ - "COPY (SELECT * FROM mytable) TO stdout DELIMITER ',' CSV HEADER;" - -Or MySQL: - - dburl=mysql://user:pass@host/mydb - dbtableurl=$dburl/mytable - parallel --sqlandworker $dbtableurl seq ::: {1..10} - sql -p -B $dburl "SELECT * FROM mytable;" > mytable.tsv - perl -pe 's/"/""/g; s/\t/","/g; s/^/"/; s/$/"/; - %s=("\\" => "\\", "t" => "\t", "n" => "\n"); - s/\\([\\tn])/$s{$1}/g;' mytable.tsv - - -=head2 EXAMPLE: Output to CSV-file for R - -If you have no need for the advanced job distribution control that a -database provides, but you simply want output into a CSV file that you -can read into R or LibreCalc, then you can use B<--results>: - - parallel --results my.csv seq ::: 10 20 30 - R - > mydf <- read.csv("my.csv"); - > print(mydf[2,]) - > write(as.character(mydf[2,c("Stdout")]),'') - - -=head2 EXAMPLE: Use XML as input - -The show Aflyttet on Radio 24syv publishes an RSS feed with their audio -podcasts on: http://arkiv.radio24syv.dk/audiopodcast/channel/4466232 - -Using B you can extract the URLs for 2019 and download them -using GNU B: - - wget -O - http://arkiv.radio24syv.dk/audiopodcast/channel/4466232 | \ - xpath -e "//pubDate[contains(text(),'2019')]/../enclosure/@url" | \ - parallel -u wget '{= s/ url="//; s/"//; =}' - - -=head2 EXAMPLE: Run the same command 10 times - -If you want to run the same command with the same arguments 10 times -in parallel you can do: - - seq 10 | parallel -n0 my_command my_args - - -=head2 EXAMPLE: Working as cat | sh. Resource inexpensive jobs and evaluation - -GNU B can work similar to B. - -A resource inexpensive job is a job that takes very little CPU, disk -I/O and network I/O. Ping is an example of a resource inexpensive -job. wget is too - if the webpages are small. - -The content of the file jobs_to_run: - - ping -c 1 10.0.0.1 - wget http://example.com/status.cgi?ip=10.0.0.1 - ping -c 1 10.0.0.2 - wget http://example.com/status.cgi?ip=10.0.0.2 - ... - ping -c 1 10.0.0.255 - wget http://example.com/status.cgi?ip=10.0.0.255 - -To run 100 processes simultaneously do: - - parallel -j 100 < jobs_to_run - -As there is not a I the jobs will be evaluated by the shell. - - -=head2 EXAMPLE: Call program with FASTA sequence - -FASTA files have the format: - - >Sequence name1 - sequence - sequence continued - >Sequence name2 - sequence - sequence continued - more sequence - -To call B with the sequence as argument run: - - cat file.fasta | - parallel --pipe -N1 --recstart '>' --rrs \ - 'read a; echo Name: "$a"; myprog $(tr -d "\n")' - - -=head2 EXAMPLE: Call program with interleaved FASTQ records - -FASTQ files have the format: - - @M10991:61:000000000-A7EML:1:1101:14011:1001 1:N:0:28 - CTCCTAGGTCGGCATGATGGGGGAAGGAGAGCATGGGAAGAAATGAGAGAGTAGCAAGG - + - #8BCCGGGGGFEFECFGGGGGGGGG@;FFGGGEG@FF to split up -the data into blocks and pipe the blocks into the processing program. - -If the program is B you can do: - - cat bigfile | parallel --pipe --recend '' -k gzip -9 > bigfile.gz - -This will split B into blocks of 1 MB and pass that to B in parallel. One B will be run per CPU. The output of B will be kept in order and saved to B - -B works fine if the output is appended, but some processing does -not work like that - for example sorting. For this GNU B can -put the output of each command into a file. 
This will sort a big file -in parallel: - - cat bigfile | parallel --pipe --files sort |\ - parallel -Xj1 sort -m {} ';' rm {} >bigfile.sort - -Here B is split into blocks of around 1MB, each block ending -in '\n' (which is the default for B<--recend>). Each block is passed -to B and the output from B is saved into files. These -files are passed to the second B that runs B on the -files before it removes the files. The output is saved to -B. - -GNU B's B<--pipe> maxes out at around 100 MB/s because every -byte has to be copied through GNU B. But if B is a -real (seekable) file GNU B can by-pass the copying and send -the parts directly to the program: - - parallel --pipe-part --block 100m -a bigfile --files sort |\ - parallel -Xj1 sort -m {} ';' rm {} >bigfile.sort - - -=head2 EXAMPLE: Grouping input lines - -When processing with B<--pipe> you may have lines grouped by a -value. Here is I: - - Transaction Customer Item - 1 a 53 - 2 b 65 - 3 b 82 - 4 c 96 - 5 c 67 - 6 c 13 - 7 d 90 - 8 d 43 - 9 d 91 - 10 d 84 - 11 e 72 - 12 e 102 - 13 e 63 - 14 e 56 - 15 e 74 - -Let us assume you want GNU B to process each customer. In -other words: You want all the transactions for a single customer to be -treated as a single record. - -To do this we preprocess the data with a program that inserts a record -separator before each customer (column 2 = $F[1]). Here we first make -a 50 character random string, which we then use as the separator: - - sep=`perl -e 'print map { ("a".."z","A".."Z")[rand(52)] } (1..50);'` - cat my.csv | \ - perl -ape '$F[1] ne $l and print "'$sep'"; $l = $F[1]' | \ - parallel --recend $sep --rrs --pipe -N1 wc - -If your program can process multiple customers replace B<-N1> with a -reasonable B<--blocksize>. - - -=head2 EXAMPLE: Running more than 250 jobs workaround - -If you need to run a massive amount of jobs in parallel, then you will -likely hit the filehandle limit which is often around 250 jobs. If you -are super user you can raise the limit in /etc/security/limits.conf -but you can also use this workaround. The filehandle limit is per -process. That means that if you just spawn more GNU Bs then -each of them can run 250 jobs. This will spawn up to 2500 jobs: - - cat myinput |\ - parallel --pipe -N 50 --round-robin -j50 parallel -j50 your_prg - -This will spawn up to 62500 jobs (use with caution - you need 64 GB -RAM to do this, and you may need to increase /proc/sys/kernel/pid_max): - - cat myinput |\ - parallel --pipe -N 250 --round-robin -j250 parallel -j250 your_prg - - -=head2 EXAMPLE: Working as mutex and counting semaphore - -The command B is an alias for B. - -A counting semaphore will allow a given number of jobs to be started -in the background. When the number of jobs are running in the -background, GNU B will wait for one of these to complete before -starting another command. B will wait for all jobs to -complete. - -Run 10 jobs concurrently in the background: - - for i in *.log ; do - echo $i - sem -j10 gzip $i ";" echo done - done - sem --wait - -A mutex is a counting semaphore allowing only one job to run. This -will edit the file I and prepends the file with lines with the -numbers 1 to 3. - - seq 3 | parallel sem sed -i -e '1i{}' myfile - -As I can be very big it is important only one process edits -the file at the same time. 
-
-Name the semaphore to have multiple different semaphores active at the
-same time:
-
-  seq 3 | parallel sem --id mymutex sed -i -e '1i{}' myfile
-
-
-=head2 EXAMPLE: Mutex for a script
-
-Assume a script is called from cron or from a web service, but only
-one instance can be run at a time. With B<sem> and B<--shebang-wrap>
-the script can be made to wait for other instances to finish. Here in
-B<bash>:
-
-  #!/usr/bin/sem --shebang-wrap -u --id $0 --fg /bin/bash
-
-  echo This will run
-  sleep 5
-  echo exclusively
-
-Here B<perl>:
-
-  #!/usr/bin/sem --shebang-wrap -u --id $0 --fg /usr/bin/perl
-
-  print "This will run ";
-  sleep 5;
-  print "exclusively\n";
-
-Here B<python>:
-
-  #!/usr/local/bin/sem --shebang-wrap -u --id $0 --fg /usr/bin/python
-
-  import time
-  print "This will run ";
-  time.sleep(5)
-  print "exclusively";
-
-
-=head2 EXAMPLE: Start editor with filenames from stdin (standard input)
-
-You can use GNU B<parallel> to start interactive programs like emacs or vi:
-
-  cat filelist | parallel --tty -X emacs
-  cat filelist | parallel --tty -X vi
-
-If there are more files than will fit on a single command line, the
-editor will be started again with the remaining files.
-
-
-=head2 EXAMPLE: Running sudo
-
-B<sudo> requires a password to run a command as root. It caches the
-access, so you only need to enter the password again if you have not
-used B<sudo> for a while.
-
-The command:
-
-  parallel sudo echo ::: This is a bad idea
-
-is no good, as you would be prompted for the sudo password for each of
-the jobs. You can either do:
-
-  sudo echo This
-  parallel sudo echo ::: is a good idea
-
-or:
-
-  sudo parallel echo ::: This is a good idea
-
-This way you only have to enter the sudo password once.
-
-
-=head2 EXAMPLE: GNU Parallel as queue system/batch manager
-
-GNU B<parallel> can work as a simple job queue system or batch manager.
-The idea is to put the jobs into a file and have GNU B<parallel> read
-from that continuously. As GNU B<parallel> will stop at end of file we
-use B<tail> to continue reading:
-
-  true >jobqueue; tail -n+0 -f jobqueue | parallel
-
-To submit your jobs to the queue:
-
-  echo my_command my_arg >> jobqueue
-
-You can of course use B<-S> to distribute the jobs to remote
-computers:
-
-  true >jobqueue; tail -n+0 -f jobqueue | parallel -S ..
-
-Output will only be printed when reading the next input after a job
-has finished: so you need to submit a job after the first has finished
-to see the output from the first job.
-
-If you keep this running for a long time, jobqueue will grow. A way of
-removing the jobs already run is by making GNU B<parallel> stop when
-it hits a special value and then restart. To use B<--eof> to make GNU
-B<parallel> exit, B<tail> also needs to be forced to exit:
-
-  true >jobqueue;
-  while true; do
-    tail -n+0 -f jobqueue |
-      (parallel -E StOpHeRe -S ..; echo GNU Parallel is now done;
-       perl -e 'while(<>){/StOpHeRe/ and last};print <>' jobqueue > j2;
-       (seq 1000 >> jobqueue &);
-       echo Done appending dummy data forcing tail to exit)
-    echo tail exited;
-    mv j2 jobqueue
-  done
-
-In some cases you can run on more CPUs and computers during the night:
-
-  # Day time
-  echo 50% > jobfile
-  cp day_server_list ~/.parallel/sshloginfile
-  # Night time
-  echo 100% > jobfile
-  cp night_server_list ~/.parallel/sshloginfile
-  tail -n+0 -f jobqueue | parallel --jobs jobfile -S ..
-
-GNU B<parallel> discovers if B<jobfile> or B<~/.parallel/sshloginfile>
-changes.
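A minimal, self-contained sketch of the queue pattern above, using the same illustrative file name I<jobqueue>; remember that a job's output is only printed once the next job has been read:

  true > jobqueue
  tail -n+0 -f jobqueue | parallel &           # start the queue runner
  echo "sleep 2; echo job 1 done" >> jobqueue  # submit a job
  echo "sleep 1; echo job 2 done" >> jobqueue  # submit another job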
-
-
-=head2 EXAMPLE: GNU Parallel as dir processor
-
-If you have a dir in which users drop files that need to be processed
-you can do this on GNU/Linux (If you know what B<inotifywait> is
-called on other platforms file a bug report):
-
-  inotifywait -qmre MOVED_TO -e CLOSE_WRITE --format %w%f my_dir |\
-    parallel -u echo
-
-This will run the command B<echo> on each file put into B<my_dir> or
-subdirs of B<my_dir>.
-
-You can of course use B<-S> to distribute the jobs to remote
-computers:
-
-  inotifywait -qmre MOVED_TO -e CLOSE_WRITE --format %w%f my_dir |\
-    parallel -S .. -u echo
-
-If the files to be processed are in a tar file then unpacking one file
-and processing it immediately may be faster than first unpacking all
-files. Set up the dir processor as above and unpack into the dir.
-
-Using GNU B<parallel> as dir processor has the same limitations as
-using GNU B<parallel> as queue system/batch manager.
-
-
-=head2 EXAMPLE: Locate the missing package
-
-If you have downloaded source and tried compiling it, you may have seen:
-
-  $ ./configure
-  [...]
-  checking for something.h... no
-  configure: error: "libsomething not found"
-
-Often it is not obvious which package you should install to get that
-file. Debian has `apt-file` to search for a file. `tracefile` from
-https://gitlab.com/ole.tange/tangetools can tell which files a program
-tried to access. In this case we are interested in one of the last
-files:
-
-  $ tracefile -un ./configure | tail | parallel -j0 apt-file search
+See: B<man parallel_examples>
 
 =head1 SPREADING BLOCKS OF DATA
 
diff --git a/src/pod2graph b/src/pod2graph
index 36526943..a7be1001 100755
--- a/src/pod2graph
+++ b/src/pod2graph
@@ -30,8 +30,11 @@
 # to a graph.pdf with link between --option and --other-option
 
 $pod=join("",<>);
+# Remove stuff before OPTIONS
 $pod=~s/^.*=head1 OPTIONS//s;
+# Remove from EXAMPLES (which is next section) and till end
 $pod=~s/=head1 EXAMPLES.*//s;
+# Remove =over / =back pairs
 $pod=~s/^.*?=over//s;
 $pod=~s/=back\s*$//s;
 $pod=~s/=over.*?=back//sg;
@@ -40,19 +43,21 @@
 $in_text = 0;
 $in_item = 0;
 $in_see_also = 0;
-
 for(split(/\n\n+/,$pod)) {
     if(/^See also:\s+(\S.*)/s) {
+        # "See also" paragraph
         $lex = "seealso";
         $in_text = 0;
         $in_item = 0;
         $in_see_only = 1;
     } elsif(/^=item\s+(B<[{]=.*?perl expression.*?=[}]>|[IB]<.*?>)(\s|$)/s) {
+        # "=item" paragraph
         $lex = "item";
         $in_text = 0;
         $in_item = 1;
         $in_see_only = 0;
     } elsif(/\S/) {
+        # else it is just text
         $lex = "text";
         $in_text = 1;
         $in_item = 0;
@@ -60,11 +65,13 @@ for(split(/\n\n+/,$pod)) {
     }
     if($lex eq "seealso") {
+        # We found "See also": output edge
         if($lastlex eq "item") {
             @saveditems = @items;
             @items = ();
         }
         my $to = $1;
+        # Edge from = item/item/item
         my $from =
             (join "/",
              map {
                  s/I<(.*?)>/$1/g;