From 5298af094dfd6f3ffe3447ac029fb036de7e6f4b Mon Sep 17 00:00:00 2001 From: Ole Tange Date: Wed, 27 Jul 2011 19:34:30 +0200 Subject: [PATCH] parallel man page: parallel web spider --- doc/release_new_version | 43 ++++------------------------------------- src/parallel.pod | 31 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/doc/release_new_version b/doc/release_new_version index d244570c..9aa31121 100644 --- a/doc/release_new_version +++ b/doc/release_new_version @@ -177,50 +177,15 @@ cc:Peter Simons , Sandro Cazzaniga , Christian Faulhammer , Ryoichiro Suzuki , Jesse Alama -Subject: GNU Parallel 20110722 ('Murdoch') released +Subject: GNU Parallel 20110822 ('Utøya') released -GNU Parallel 20110722 ('Murdoch') has been released. It is +GNU Parallel 20110822 ('Utøya') has been released. It is available for download at: http://ftp.gnu.org/gnu/parallel/ New in this release: -* niceload: --hard will suspend a program if a limit is reached - as - opposed to just slowing the program down. - -* niceload: --soft will slow the program down - as opposed to - suspending the program completely. - -* niceload: --run-io will slow down a program if disk io goes above a - certain limit. - -* niceload: --run-load will slow down a program if loadaverage goes - above a certain limit. - -* niceload: --run-mem will slow down a program if free memory goes - below a certain limit. - -* niceload: --run-noswap will slow down a program if the computer is - swapping. - -* niceload: --start-io, --start-load, --start-mem, --start-noswap will - defer starting a program until the system is below the limit. - -* --io, --load, --mem, and --noswap sets both --run-* and --start-*. - -* niceload got a major rewrite and is now object oriented. - -* GNU Parallel was presented at Nordic Perl Workshop 2011. - http://conferences.yapceurope.org/npw2011/talk/3416 - -* Blog post about zcat and GNU Parallel. Thanks to Dr. John. 
- http://drjohnstechtalk.com/blog/2011/06/gnu-parallel-really-helps-with-zcat/ - -* 2 blog posts in Japanese. Thanks to Negima. - http://d.hatena.ne.jp/negima1976/20110607/1307412660 - http://d.hatena.ne.jp/negima1976/20110628/1309252494 - -* Blog post for bioinformatics. Thanks to Chris Miller. - http://chrisamiller.com/science/2010/05/26/use-parallel-for-easy-multi-processor-execution/ +* Blog post about optimizing JPEGs. Thanks to Thomas Jost. + http://schnouki.net/2011/07/22/optimizing-jpeg-pictures/ * Bug fixes and man page updates. diff --git a/src/parallel.pod b/src/parallel.pod index 1a46248a..13111e9d 100644 --- a/src/parallel.pod +++ b/src/parallel.pod @@ -1523,6 +1523,37 @@ B<$(date -d "today -{1} days" +%Y%m%d)> with give the dates in YYYYMMDD with {1} days subtracted. +=head1 EXAMPLE: Parallel spider + +The script below spiders a URL in parallel (breadth first). Run it +like this: + +B<./parallel-spider http://www.gnu.org/software/parallel> + + #!/bin/bash + + # E.g. http://www.gnu.org/software/parallel + URL=$1 + URLLIST=$(mktemp urllist.XXXX) + URLLIST2=$(mktemp urllist.XXXX) + SEEN=$(mktemp seen.XXXX) + + # Spider to get the URLs + echo $URL >$URLLIST + cp $URLLIST $SEEN + + while [ -s $URLLIST ] ; do + cat $URLLIST | + parallel lynx -listonly -image_links -dump {} \; echo Spidered: {} \>\&2 | + perl -ne 's/#.*//; s/\s+\d+.\s(\S+)$/$1/ and do { $seen{$1}++ or print }' | + grep -F $URL | + grep -v -x -F -f $SEEN | tee -a $SEEN > $URLLIST2 + mv $URLLIST2 $URLLIST + done + + rm -f $URLLIST $URLLIST2 $SEEN + + =head1 EXAMPLE: Process files from a tar file while unpacking If the files to be processed are in a tar file then unpacking one file