Import sql inputfile tabel, Split colonner til {n} sql :foo 'select * from bar' | parallel --colsep '\s+\|\s+' do_stuff {4} {1} parallel -a table_file --colsep '\s+' do_stuff {4} {1} if(defined $::opt_colsep and defined @::opt_a and @::opt_a > 1) { # must be done after converting :::: to -a -a warn("--colsep incompatible with multiple argument files. Ignoring --colsep"); $::opt_colsep = undef; } if($::opt_colsep) { # read input either from -a or from stdin my $max_cols = 0; my @table; my $lineno = 0; $Global::input_is_filename = 0; # cheat get_next_arg into not quoting while get_next_arg { my @cols = split /$::opt_colsep/o, $_; if ($Global::trim) { for(@cols) { s/^\s+//; s/\s+$//; } } $max_cols = max($#cols+1,$max_cols); @table[$lineno++] = @cols; } $::opt_N = $max_cols; for ($lineno = 0; $lineno <= $#table; $lineno++) { if (not defined $table[$lineno][$max_col-1]) { # Make sure the table has the same columns for all rows $table[$lineno][$max_col-1] = ""; } unget_arg(@table[$lineno]); } } --autocolsep: Læs alle linjer. Prøv fastlængde: Find tegn, som står i alle linjer på de samme pladser. Risiko for falske pos Prøv fieldsep: Find eet tegn, som optræder det samme antal gange i alle linjer (tab sep) Prøv klyngesep: Find den samme klynge tegn, som står samme antal gange i alle linjer (' | ' sep) Fjern whitespace før og efter colonne hvis der er n af tegn A og 2n af tegn B, så a | b | c Simpleste: tab sep for hver linje max,min count for hver char for hver char if max == min : potentiel min_potentiel = min(min_potentiel,min) for potentiel: if min % min_potentiel = 0: sepchars += potentiel,no of sepchars += min / min_potentiel colsep = [sepchars]{no_of_sepchars} # Hvordan udregnes system limits på remote systems hvis jeg ikke ved, hvormange # argumenter, der er? 
Lav system limits lokalt og lad det være max # TODO max_line_length on remote # TODO compute how many can be transferred within max_line_length # TODO Unittest with filename that is long and requires a lot of quoting. Will there be too many # TODO --max-number-of-jobs print the system limited number of jobs # TODO Debian package # TODO to kill from a run script parallel should set PARALLEL_PID that can be sig termed # TAGS: parallel | parallel processing | multicore | multiprocessor | Clustering/Distributed Networks # job control | multiple jobs | parallelization | text processing | cluster | filters # Clustering Tools | Command Line Tools | Utilities | System Administration # Bash parallel =head1 YouTube video2 Converting WAV files to MP3 using GNU Parallel # Run one job per CPU core # For 'foo.wav' call the output file 'foo.mp3' find music-files -type f | parallel -j+0 lame {} -o {.}.mp3 # Run one job per CPU core # Run on local computer + 2 remote computers # Give us progress information # For 'foo.wav' call the output file 'foo.mp3' find music-files -type f | parallel -j+0 -S :,computer1.example.com,computer2.example.com \ --eta --trc {.}.mp3 lame {} -o {.}.mp3 =head1 YouTube video GNU Parallel is a tool with lots of uses in shell. Every time you use xargs or a for-loop GNU Parallel can probably do that faster, safer and more readable. If you have access to more computers through ssh, GNU Parallel makes it easy to distribute jobs to these. 
terminal2: ssh parallel@vh2.pi.dk ssh parallel@vh2.pi.dk and PS1="\[\e[7m\]GNU Parallel:\[\033[01;34m\]\w\[\033[00m\e[27m\]$ " gunzip logs/*gz rm -f logs/*bz2* rm -rf zip/*[^p] rm -rf dirs/* rm -rf parallel-*bz2 xvidcap ffmpeg -i 20100616_002.mp4 -ab 320k -ar 44100 speak.mp3 # Merge video using youtube #ffmpeg -i speak.mp3 -i xvidcap.mpeg -target mpeg -hq -minrate 8000000 \ #-title "GNU Parallel" -author "Ole Tange" -copyright "(CC-By-SA) 2010" -comment "Intro video of GNU Parallel 20100616" videoaudio.mpg # GNU PARALLEL - BASIC USAGE # A GNU tool for parallelizing shell commands ## Ole Tange Author # GET GNU PARALLEL wget ftp://ftp.gnu.org/gnu/parallel/parallel-20100620.tar.bz2 tar xjf parallel-20100620.tar.bz2 cd parallel-20100620 ./configure && make ## su make install exit cd ## scp /usr/local/bin/parallel root@parallel:/usr/local/bin/ # YOUR FIRST PARALLEL JOBS cd logs du /usr/bin/time gzip -1 * ## 24 sek - 22 sek /usr/bin/time gunzip * ## 24 sek - 18 ls | time parallel gzip -1 ## 17 sek - 10 ls | time parallel gunzip ## 25 sek - 19 # RECOMPRESS gz TO bz2 ls | time parallel gzip -1 ls *.gz | time parallel -j+0 --eta 'zcat {} | bzip2 -9 >{.}.bz2' ## Explain command line ## vis top local ## Man that is boring ## 2m41s - 2m - 3m35s # RECOMPRESS gz TO bz2 USING local(:) AND REMOTE server1-4 ls *.gz |time parallel -j+0 --eta -Sserver1,server2,server3,server4,: \ --transfer --return {.}.bz2 --cleanup 'zcat {} | bzip2 -9 > {.}.bz2' ## Explain command line ## Explain server config ## vis top local ## vis top remote1-3 ## 49 sek # RECOMPRESS gz TO bz2 USING A SCRIPT ON local AND REMOTE server1-2,4 # (imagine the script is way more complex) cp ../recompress /tmp cat /tmp/recompress ls *.gz |time parallel -j+0 --progress -Sserver1,server2,server4,: \ --trc {.}.bz2 --basefile /tmp/recompress '/tmp/recompress {} {.}.bz2' # MAKING SMALL SCRIPTS cd ../zip ls -l ls *.zip | parallel 'mkdir {.} && cd {.} && unzip ../{}' ### ls -l # GROUP OUTPUT traceroute debian.org traceroute 
debian.org & traceroute freenetproject.org ### (echo debian.org; echo freenetproject.org) | parallel traceroute ### # KEEP ORDER (echo debian.org; echo freenetproject.org) | parallel -k traceroute ### # RUN MANY JOBS. USE OUTPUT # Find the number of hosts responding to ping ping -c 1 178.63.11.1 ping -c 1 178.63.11.1 | grep '64 bytes' seq 1 255 | parallel -j255 ping -c 1 178.63.11.{} 2>&1 \ | grep '64 bytes' | wc -l seq 1 255 | parallel -j0 ping -c 1 178.63.11.{} 2>&1 \ | grep '64 bytes' | wc -l # MULTIPLE ARGUMENTS # make dir: test-(1-5000).dir cd ../dirs rm -rf *; sync seq 1 10 | parallel echo mkdir test-{}.dir seq 1 5000 | time parallel mkdir test-{}.dir ## 15 sek rm -rf *; sync seq 1 10 | parallel -X echo mkdir test-{}.dir seq 1 5000 | time parallel -X mkdir test-{}.dir # CALLING GNU PARALLEL FROM GNU PARALLEL # make dir: top-(1-100)/sub-(1-100) rm -rf *; sync seq 1 100 | time parallel -I @@ \ 'mkdir top-@@; seq 1 100 | parallel -X mkdir top-@@/sub-{}' find | wc -l cd # Thank you for watching # # If you like GNU Parallel: # * Post this video on your blog/Twitter/Facebook/Linkedin # * Join the mailing list http://lists.gnu.org/mailman/listinfo/parallel # * Request or write a review for your favourite magazine # * Request or build a package for your favourite distribution # * Invite me for your next conference (Contact http://ole.tange.dk) # # If GNU Parallel saves you money: # * Donate to FSF https://my.fsf.org/donate/ # # Find GNU Parallel at http://www.gnu.org/software/parallel/ # GIVE ME THE FIRST RESULT (echo foss.org.my; echo debian.org; echo freenetproject.org) | parallel -H2 traceroute {}";false" find . -type f | parallel -k -j150% -n 1000 -m grep -H -n STRING {} (echo foss.org.my; echo debian.org; echo freenetproject.org) | parallel traceroute =head1 IDEAS Kan vi lave flere ssh'er, hvis vi venter lidt? En ssh med 20% loss og 900 ms delay, så kan login nås på 15 sek. 
Test if -0 works on filenames ending in '\n' If there are no more jobs (STDIN is at EOF) then make sure to distribute the arguments evenly if running -X. =head1 search terms GNU parallel execution shell bash script simultaneous concurrent linux scripting run xargs ppss code.google.com/p/ppss/ =head1 options One char options not used: F G J K P Q Y Skilletegn i sshlogin: #=item B<--sshlogin> I<[ncpu/]sshlogin[,[ncpu/]sshlogin[,...]]> (beta testing) # Skilletegn: # No: "#!&()?\<>|;*'~ shellspecial # No: @.- part of user@i.p.n.r i.p.n.r host-name # No: , separates different sshlogins # No: space Will make it hard to do: 8/server1,server2 # Maybe: / 8//usr/bin/myssh,//usr/bin/ssh # %/=:_^ =head2 mutex mutex -b -n -l lockid -m max_locks [command] mutex -u lockid -b run command in background -l lockfile will lock using the lockid -n nonblocking -m maximal number of locks (default 1) -u unlock If a command is given, it works like: mutex -l lockfile -n number_of_locks ; command; mutex -u lockfile If -b is given, it works like: mutex -l lockfile -n number_of_locks ; (command; mutex -u lockfile)& Kan vi finde på lockid som giver mening? Parallelize so this can be done: mdm.screen find dir -execdir mdm-run cmd {} \; Maybe: find dir -execdir par$ --communication-file /tmp/comfile cmd {} \; find dir -execdir mutex -j4 -b cmd {} \; =head2 Comfile This will put a lock on /tmp/comfile. The number of locks is the number of running commands. If the number is smaller than -j then it will start a process in the background ( cmd & ), otherwise wait. par$ --wait /tmp/comfile will wait until there are no more locks on the file. =head1 Unlikely Accept signal INT instead of TERM to complete current running jobs but do not start new jobs. Print out the number of jobs waiting to complete on STDERR. Accept sig INT again to kill now. This seems to be hard, as all foreground processes get the INT from the shell.