Merge branch 'master' of ssh://git.sv.gnu.org/srv/git/parallel

Conflicts:
	src/parallel
	testsuite/tests-to-run/parallel-local2.sh
This commit is contained in:
Ole Tange 2012-10-17 02:16:01 +02:00
commit d9675d0bd0
26 changed files with 353 additions and 19 deletions

6
src/optional/genresults.sh Executable file
View file

@ -0,0 +1,6 @@
#!/bin/bash
#
# Generate the result files used to test the query modules.
#
# Runs ../parallel twice with the named input sources `a` and `b`
# (--header :), once with the result prefix "foo_" and once with "bar_",
# storing the output of every `echo {a} {b}` job under testresults/.

# Stop at the first failing command instead of silently producing a
# partial set of result files.
set -euo pipefail

# Both ../parallel and testresults/ are relative paths; resolve them from
# the directory holding this script rather than the caller's cwd.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")"

# Make sure the output directory exists before any result file is written.
mkdir -p testresults

../parallel --header : --result testresults/foo_ echo {a} {b} ::: a 1 2 ::: b 0.30 0.40
../parallel --header : --result testresults/bar_ echo {a} {b} ::: a 5 6 ::: b 0.70 0.80

4
src/optional/python/.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
*.pyc
build
dist
*.egg-info

View file

@ -0,0 +1,40 @@
gnuparallel : Simple loading of GNU parallel result files.
The gnuparallel package provides a single function, `load`, which
loads results from files generated by GNU parallel into a Pandas
DataFrame object. See `help(gnuparallel.load)` for details.
Installation:
# python setup.py install
Sample usage:
1. Generate some results files by running parallel from the command line:
# mkdir outdir
# parallel --results outdir/ echo {arg1} {arg2} ::: arg1 1 2 ::: arg2 three four
2. Load the results using the gnuparallel Python package:
# python
Python 2.7.3 (default, Apr 24 2012, 00:00:54)
[GCC 4.7.0 20120414 (prerelease)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import gnuparallel
>>> help(gnuparallel.load)
>>> myresults = gnuparallel.load('outdir')
>>> myresults
1 2 _prefix _stream resfile
0 2 four NaN stdout outdir/stdout\t1\t2\t2\tfour
1 2 three NaN stdout outdir/stdout\t1\t2\t2\tthree
2 2 arg2 NaN stdout outdir/stdout\t1\t2\t2\targ2
3 1 four NaN stdout outdir/stdout\t1\t1\t2\tfour
4 1 three NaN stdout outdir/stdout\t1\t1\t2\tthree
5 1 arg2 NaN stdout outdir/stdout\t1\t1\t2\targ2
6 arg1 four NaN stdout outdir/stdout\t1\targ1\t2\tfour
7 arg1 three NaN stdout outdir/stdout\t1\targ1\t2\tthree
8 arg1 arg2 NaN stdout outdir/stdout\t1\targ1\t2\targ2
See documentation for the pandas project (http://pandas.pydata.org/) for
instructions on how to access and manipulate the loaded results.

View file

@ -0,0 +1,3 @@
"""Load GNU Parallel --results files into a Pandas DataFrame."""
# Explicit relative import: picks up the sibling _loader module on both
# Python 2.5+ and Python 3 (implicit relative imports are Python 2 only).
from ._loader import *

View file

@ -0,0 +1,147 @@
"""
A function for loading the --result files generated by GNU Parallel.
"""
__all__ = ['load']
try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3
import os

import pandas as pd
def load(_dir, _process=None, _format=None, _stream='stdout',
         _prefix=None, _infer_types=True, **options):
    """Load files generated with parallel's --result option.

    One use of GNU parallel is to call one command many times, each
    time with a different set of arguments. With the --result option,
    parallel will capture stdout and stderr from these processes and
    store them in files named according to the arguments of each
    individual call. This function provides easy loading of these
    result files into a Pandas DataFrame.

    Parameters
    ----------
    _dir : str
        Directory containing the results files.
    _process : function, optional
        Function that opens a results file and returns an object containing
        its results. If not provided, the resulting data frame will include
        a column containing the file names, not the actual results.

        If provided, the function should take a filename as its sole parameter.
        Whatever the function returns will be stored in the "res" column of
        the resulting DataFrame.
    _format : dict, optional
        Dictionary of format strings, used to convert any provided filter
        values to a format matching the results file names.

        For example, if the `foo` parameter to parallel was "0.10" and you pass
        foo=0.10 as an option, you will not find the intended file because
        str(0.10) == "0.1". To fix this, you should also include the key-value
        pair "foo": "%.2f" in the _format dict. This is usually only necessary
        for float-valued arguments where rounding or precision issues might
        affect the matching process.
    _stream : str, optional
        Specify either "stdout" or "stderr" to load results files from the
        corresponding stream. Default is "stdout".
    _prefix : str, optional
        Only load result files with a specific prefix. When using the --result
        option to parallel it is possible to specify a prefix for all of the
        result files. For example,
            parallel --result /some/dir/a_prefix ...
        would place all result files into the `/some/dir` directory and all of
        the file names would begin with "a_prefix". This parameter lets you
        filter based on this prefix. If None, allow any prefix. Default None.
    _infer_types : bool, optional
        Infer data types for option values. All option values are technically
        strings (since they were passed on the command line). When _infer_types
        is True, the resulting DataFrame will convert these values to inferred
        dtypes, e.g. the number 1 instead of "1". Default True.
    **options : kwargs
        Additional keyword arguments that will be used to filter the subset
        of results included in the output. The values can be either single
        values or iterables. If they are iterable, files corresponding to any
        of the included values will be considered a match.

        For example, passing `foo=[1,2,3]` will include results from files
        corresponding to runs where the parallel argument named `foo` had
        the value "1", "2", or "3".

        See also the _format parameter.

    Returns
    -------
    res : pandas.DataFrame
        A DataFrame with one column named for each of the parallel arguments
        and, depending on the _process argument, either:
        - A "res" column containing the results corresponding to each run.
        - A "resfile" column containing the names of the results files.

        Rows follow the sorted order of the file names found in `_dir`.
    """
    if _format is None:
        _format = dict()

    # basestring only exists on Python 2; on Python 3 both str and bytes are
    # iterable and must be treated as single values, not containers.
    try:
        string_types = (basestring,)
    except NameError:
        string_types = (str, bytes)

    # Process the filter options: wrap single values in a list, then
    # convert every candidate value to the string form used in file names.
    filters = {}
    for key, value in options.items():
        if isinstance(value, string_types) or not hasattr(value, '__iter__'):
            value = [value]
        filters[key] = set(_stringify(x, _format.get(key, '')) for x in value)

    filters['_stream'] = [_stream]
    if _prefix:
        filters['_prefix'] = [_prefix]

    # Iterate over results files and collect the matches. os.listdir()
    # returns names in arbitrary order, so sort to keep the row order
    # reproducible between runs and platforms.
    matches = []
    for name in sorted(os.listdir(_dir)):
        metadata = _parse_name(name)
        metadata['resfile'] = os.path.join(_dir, metadata['resfile'])
        if _select(metadata, filters):
            matches.append(metadata)

    # Create a DataFrame from the matches.
    df = pd.DataFrame(matches)

    # Optionally try to convert string argument values to numeric types
    # by round-tripping the frame through CSV and letting read_csv infer.
    if _infer_types:
        buf = StringIO()
        df.to_csv(buf)
        df = pd.read_csv(StringIO(buf.getvalue()), index_col=0)

    # Open and process the results. This needs to happen after the type
    # inference phase since the processed results can be arbitrary
    # Python objects and might not survive the round-trip.
    if _process and not df.empty:
        df['res'] = df.resfile.apply(_process)
        df = df.drop('resfile', axis=1)

    return df
def _parse_name(file, sep='\t'):
"""Return a dict containing metadata extracted from the file name."""
tokens = file.split(sep)
prefix_stream = tokens[0]
metadata = {k:v for k,v in zip(tokens[1::2], tokens[2::2])}
stream_index = prefix_stream.find('stdout')
if stream_index == -1:
stream_index = prefix_stream.find('stderr')
prefix, stream = prefix_stream[:stream_index], prefix_stream[stream_index:]
metadata.update({'_prefix': prefix, '_stream': stream, 'resfile': file})
return metadata
def _select(metadata, filter):
"""Return true if the metadata entry matches the filter, False otherwise."""
if any(k not in metadata for k in filter):
return False
if any(all(v != metadata[k] for v in vs) for k,vs in filter.iteritems()):
return False
return True
def _stringify(x, fmt):
"""Return the string representation of x, using a format string if provided"""
if fmt:
return fmt % x
else:
return str(x)

15
src/optional/python/setup.py Executable file
View file

@ -0,0 +1,15 @@
#!/usr/bin/env python
"""Packaging script for the gnuparallel result-loading package."""

# install_requires is a setuptools keyword; distutils' setup() does not
# act on it. Prefer setuptools so the pandas dependency is honoured, and
# fall back to distutils where setuptools is not installed.
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

setup(
    name='gnuparallel',
    version='0.1',
    description='Load GNU parallel result files.',
    author='Drew Frank',
    author_email='drewfrank@gmail.com',
    packages=[
        'gnuparallel'
    ],
    install_requires=['pandas']
)

View file

@ -0,0 +1,66 @@
import pandas as pd
import unittest
from gnuparallel import load
result_dir = '../../testresults'
def _sum_of_numbers(path):
    """Return the sum of the whitespace-separated numbers stored in `path`."""
    with open(path) as handle:
        return sum(float(token) for token in handle.read().split())


class TestLoader(unittest.TestCase):
    # Exercises gnuparallel.load against the result files in result_dir.

    def test_basics(self):
        # Default load: stdout results of every prefix, one row per file.
        df = load(result_dir)
        self.assertEqual(set(df.columns), set(['a', 'b', '_prefix', 'resfile', '_stream']))
        self.assertEqual(df.shape[0], 8)

    def test_prefix(self):
        # _prefix restricts the rows to files carrying that prefix.
        df = load(result_dir, _prefix='foo_')
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 6)

        df = load(result_dir, _prefix='bar_')
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 22)

        df = load(result_dir, _prefix='BAD')
        self.assertTrue(df.empty)

    def test_filters(self):
        # Keyword filters accept a single value or an iterable of values.
        df = load(result_dir, a=2)
        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.a.sum(), 4)

        df = load(result_dir, a=[2])
        self.assertEqual(df.shape[0], 2)
        self.assertEqual(df.a.sum(), 4)

        df = load(result_dir, a=[1,2])
        self.assertEqual(df.shape[0], 4)
        self.assertEqual(df.a.sum(), 6)

        df = load(result_dir, a=1000)
        self.assertTrue(df.empty)

    def test_infer_types(self):
        # Compare dtype names instead of going through the non-public
        # pd.np alias, which modern pandas no longer provides.
        df = load(result_dir)
        self.assertEqual(str(df.a.dtype), 'int64')

        df = load(result_dir, _infer_types=False)
        self.assertEqual(str(df.a.dtype), 'object')

    def test_format(self):
        # str(0.3) == "0.3" does not match the "0.30" in the file names
        # unless a format string is supplied.
        df = load(result_dir, b=0.3)
        self.assertTrue(df.empty)

        df = load(result_dir, b=0.3, _format={'b': '%.2f'})
        self.assertEqual(df.shape[0], 2)

    def test_stream(self):
        df = load(result_dir, _stream='stderr')
        self.assertTrue((df['_stream'] == 'stderr').all())

    def test_process(self):
        # The a=1 runs echo "1 0.30" and "1 0.40". Compare the sorted
        # results so the test does not depend on directory listing order.
        df = load(result_dir, a=1, _process=_sum_of_numbers)
        self.assertEqual(df.shape[0], 2)
        for actual, expected in zip(sorted(df.res), [1.3, 1.4]):
            self.assertAlmostEqual(actual, expected)


if __name__ == '__main__':
    unittest.main()

View file

@ -0,0 +1 @@
5 0.70

View file

@ -0,0 +1 @@
5 0.80

View file

@ -0,0 +1 @@
6 0.70

View file

@ -0,0 +1 @@
6 0.80

View file

@ -0,0 +1 @@
1 0.30

View file

@ -0,0 +1 @@
1 0.40

View file

@ -0,0 +1 @@
2 0.30

View file

@ -0,0 +1 @@
2 0.40

View file

@ -445,6 +445,15 @@ If I<eof-str> is omitted, there is no end of file string. If neither
B<-E> nor B<-e> is used, no end of file string is used.
=item B<--env> I<var>
Copy environment variable I<var>. This will copy I<var> to the
environment that the command is run in. This is especially useful for
remote environments.
Caveat: If I<var> contains a newline ('\n'), the value will be corrupted.
=item B<--eta>
Show the estimated number of seconds before finishing. This forces GNU
@ -979,9 +988,9 @@ Only used with B<--pipe>.
Results in files named by tab separated arguments. Save the output
into files. The file names will be prefixed with I<prefix> which can
contain a path with a prefix string. The file with output from stdout
(standard output) will prefixed with 'I<prefix>stdout_'. The file
(standard output) will be prefixed with 'I<prefix>stdout'. The file
with output from stderr (standard error) will be prefixed with
'I<prefix>stderr_'.
'I<prefix>stderr'.
The postfix is the header of the input source (if using B<--header :>)
or the number of the input source followed by the value of the input
@ -994,14 +1003,14 @@ E.g:
will generate the files:
foo/barstderr_a I b III
foo/barstderr_a I b IIII
foo/barstderr_a II b III
foo/barstderr_a II b IIII
foo/barstdout_a I b III
foo/barstdout_a I b IIII
foo/barstdout_a II b III
foo/barstdout_a II b IIII
foo/barstderr a I b III
foo/barstderr a I b IIII
foo/barstderr a II b III
foo/barstderr a II b IIII
foo/barstdout a I b III
foo/barstdout a I b IIII
foo/barstdout a II b III
foo/barstdout a II b IIII
and
@ -1009,14 +1018,14 @@ and
will generate the files:
foo/barstderr_1 I 2 III
foo/barstderr_1 I 2 IIII
foo/barstderr_1 II 2 III
foo/barstderr_1 II 2 IIII
foo/barstdout_1 I 2 III
foo/barstdout_1 I 2 IIII
foo/barstdout_1 II 2 III
foo/barstdout_1 II 2 IIII
foo/barstderr 1 I 2 III
foo/barstderr 1 I 2 IIII
foo/barstderr 1 II 2 III
foo/barstderr 1 II 2 IIII
foo/barstdout 1 I 2 III
foo/barstdout 1 I 2 IIII
foo/barstdout 1 II 2 III
foo/barstdout 1 II 2 IIII
where all spaces are TABs (\t).
@ -1850,6 +1859,16 @@ This also works if the input file is a file with columns:
cat addressbook.tsv | parallel --colsep '\t' --header : echo {Name} {E-mail address}
=head1 EXAMPLE: Count the differences between all files in a dir
Using B<--results> the results are saved in /tmp/diffcount*.
parallel --results /tmp/diffcount "diff -U 0 {1} {2} |tail -n +3 |grep -v '^@'|wc -l" ::: * ::: *
To see the difference between file A and file B look at the file
'/tmp/diffcount 1 A 2 B' where spaces are TABs (\t).
=head1 EXAMPLE: Speeding up fast jobs
Starting a job on the local machine takes around 3 ms. This can be a

View file

@ -52,9 +52,26 @@ perl -e 'for(160) { printf "%c%c %c%d\0",$_,$_,$_,$_ }' | stdout parallel --nice
# Test that parallel prints its "Starting N processes took ..." warning
# when spawning jobs is slow.
echo '### Test too slow spawning'
# Kill any burnP6 left over from an earlier run; the error message printed
# when none is running is discarded.
killall -9 burnP6 2>/dev/null
# Start burnP6 jobs in the background, each wrapped in `timeout -k 25 26`.
# NOTE(review): two launch lines are visible here (a fixed 2 jobs, and one
# job per core as reported by `parallel --number-of-cores`); they look like
# the old and new side of a diff hunk - confirm which one belongs in the file.
seq 1 2 | parallel -j2 -N0 timeout -k 25 26 burnP6 &
seq `parallel --number-of-cores` | parallel -j100% -N0 timeout -k 25 26 burnP6 &
sleep 1
# Run 1000 `true` jobs with -j0 under nice. As soon as the perl filter sees
# the warning it closes STDIN, kills burnP6, prints OK and exits.
# `stdout` is a helper defined outside this view - presumably it merges
# stderr into stdout so the warning reaches the pipe; TODO confirm.
seq 1 1000 |
stdout nice nice parallel -s 100 -uj0 true |
perl -pe '/parallel: Warning: Starting \d+ processes took/ and do {close STDIN; `killall -9 burnP6`; print "OK\n"; exit }'
# Clean up any burnP6 still running.
killall -9 burnP6 2>/dev/null
# Tests for --env: exported variables must be visible, with their value
# intact, inside the command that parallel runs.
echo '### Test --env - https://savannah.gnu.org/bugs/?37351'
# Values containing blanks at both ends, quotes and the shell
# metacharacters < and >.
export TWOSPACES=' 2 spaces '
export THREESPACES=" > My brother's 12\" records < "
# Local runs: a single --env, --env given twice, and a comma-separated list.
stdout parallel --env TWOSPACES echo 'a"$TWOSPACES"b' ::: 1
stdout parallel --env TWOSPACES --env THREESPACES echo 'a"$TWOSPACES"b' 'a"$THREESPACES"b' ::: 2
stdout parallel --env TWOSPACES,THREESPACES echo 'a"$TWOSPACES"b' 'a"$THREESPACES"b' ::: 2a
# The same three invocations, run through -S localhost.
stdout parallel -S localhost --env TWOSPACES echo 'a"$TWOSPACES"b' ::: 1
stdout parallel -S localhost --env TWOSPACES --env THREESPACES echo 'a"$TWOSPACES"b' 'a"$THREESPACES"b' ::: 2
stdout parallel -S localhost --env TWOSPACES,THREESPACES echo 'a"$TWOSPACES"b' 'a"$THREESPACES"b' ::: 2a
# perl emits one NUL-terminated record per character code in 1..9,9,11..255
# (10 = newline is skipped, 9 is listed twice). The outer parallel puts each
# record into V and V2 and starts an inner parallel that echoes it via --env
# on the sshlogins given to -S, once for each of the four {#} arguments.
# `uniq -c | grep -v ' 4 '` drops every line that came back exactly four
# times, so any remaining output (apart from xauth/X11 noise) is a failure.
echo '### Test --env all chars except \n - single and double - no output is good'
perl -e 'for(1..9,9,11..255) { printf "%c%c %c%d\0",$_,$_,$_,$_ }' | stdout parallel --nice 19 -j4 -k -I // --arg-sep _ -0 V=// V2=V2=// parallel -k -j1 -S :,1/lo,1/tcsh@lo,1/csh@lo --env V,V2 echo \''"{}$V$V2"'\' ::: {#} {#} {#} {#} | uniq -c | grep -v ' 4 '|grep -v xauth |grep -v X11
# Same check using --onall with a single {#} argument.
# NOTE(review): presumably --onall runs the job once on each of the four
# sshlogins, which is why ' 4 ' is still the expected count - confirm.
echo '### Test --env all chars except \n - single and double --onall - no output is good'
perl -e 'for(1..9,9,11..255) { printf "%c%c %c%d\0",$_,$_,$_,$_ }' | stdout parallel --nice 19 -j4 -k -I // --arg-sep _ -0 V=// V2=V2=// parallel -k -j1 -S :,1/lo,1/tcsh@lo,1/csh@lo --onall --env V,V2 echo \''"{}$V$V2"'\' ::: {#} | uniq -c | grep -v ' 4 '|grep -v xauth |grep -v X11

View file

@ -46,3 +46,12 @@ a' * ? >o <i*? ][\!#¤%=( ) | }b 5
### Test --env for \160 - which kills csh - single and double --onall - no output is good
### Test too slow spawning
OK
### Test --env - https://savannah.gnu.org/bugs/?37351
a 2 spaces b 1
a 2 spaces b a > My brother's 12" records < b 2
a 2 spaces b a > My brother's 12" records < b 2a
a 2 spaces b 1
a 2 spaces b a > My brother's 12" records < b 2
a 2 spaces b a > My brother's 12" records < b 2a
### Test --env all chars except \n - single and double - no output is good
### Test --env all chars except \n - single and double --onall - no output is good